* Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
* Distributed under the terms of the MIT License.
*
* Authors:
* Stephan Aßmus <superstippi@gmx.de>
* Rene Gollent <rene@gollent.com>
* John Scipione <jscipione@gmail.com>
* Ingo Weinhold <bonefish@cs.tu-berlin.de>
*/
#include "CLanguageTokenizer.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
using CLanguage::ParseException;
using CLanguage::Token;
using CLanguage::Tokenizer;
// Creates an empty token: empty text, type TOKEN_NONE, a default-constructed
// value and position 0.
Token::Token()
	: string(""), type(TOKEN_NONE), value(), position(0)
{
}
// Copy constructor: duplicates the text, type, value and position of
// \a other member by member.
Token::Token(const Token& other)
	: string(other.string), type(other.type), value(other.value),
	  position(other.position)
{
}
// Creates a token from the first \a length characters of \a string.
//
// \param string   Start of the token text inside the tokenized input.
// \param length   Number of characters belonging to the token.
// \param position Offset of the token within the tokenized input.
// \param type     One of the TOKEN_* constants.
//
// The token's value is left default-constructed; callers that need a value
// assign it afterwards.
Token::Token(const char* string, int32 length, int32 position, int32 type)
	: string(string, length), type(type), value(), position(position)
{
}
// Assignment: copies every member of \a other into this token and returns
// a reference to this token.
Token&
Token::operator=(const Token& other)
{
	position = other.position;
	type = other.type;
	string = other.string;
	value = other.value;

	return *this;
}
// Creates a tokenizer over an empty string. Call SetTo() to supply the text
// that shall be tokenized.
Tokenizer::Tokenizer()
	:
	fString(""),
	fCurrentChar(NULL),
	fCurrentToken(),
	fReuseToken(false)
{
	// Point the cursor at the (empty) string instead of leaving it NULL, so
	// that calling NextToken() before SetTo() yields an end-of-line token
	// rather than dereferencing a null pointer. This is done in the body so
	// it does not depend on the member declaration order.
	fCurrentChar = fString.String();
}
// Starts tokenizing a new string.
//
// The text is copied into the tokenizer, the cursor is placed at its first
// character, and any pending token or rewind request is discarded.
void
Tokenizer::SetTo(const char* string)
{
	// Copy the text first; the cursor must point into our own copy.
	fString = string;
	fCurrentChar = fString.String();

	// Forget everything about the previously tokenized string.
	fReuseToken = false;
	fCurrentToken = Token();
}
// Scans and returns the next token of the string passed to SetTo().
//
// Recognized token classes:
//  - numeric constants: decimal integers, decimal fractions and "0x" hex
//    numbers (TOKEN_CONSTANT, with the parsed number stored in the value),
//  - identifiers made of letters, digits and '_' (TOKEN_IDENTIFIER),
//  - quoted literals delimited by '"' or '\'' (TOKEN_STRING_LITERAL); when
//    the closing quote is missing only the opening quote is returned, as
//    TOKEN_DOUBLE_QUOTE or TOKEN_SINGLE_QUOTE,
//  - operators (see _ParseOperator()) and single-character punctuation.
//
// Once the end of the input has been reached, a TOKEN_END_OF_LINE token is
// returned on this and every further call. After RewindToken() the current
// token is returned once more without advancing.
//
// Throws ParseException ("unexpected character", "error in constant") if the
// input cannot be tokenized; _ParseHexOperand() may throw as well.
const Token&
Tokenizer::NextToken()
{
	// The end-of-line token is sticky.
	if (fCurrentToken.type == TOKEN_END_OF_LINE)
		return fCurrentToken;

	// A rewind was requested: hand out the current token a second time.
	if (fReuseToken) {
		fReuseToken = false;
		return fCurrentToken;
	}

	// Skip leading whitespace. The <ctype.h> functions require a value
	// representable as unsigned char, hence the casts (plain char may be
	// negative for non-ASCII bytes).
	while (*fCurrentChar != 0
		&& isspace(static_cast<unsigned char>(*fCurrentChar))) {
		fCurrentChar++;
	}

	if (*fCurrentChar == 0) {
		return fCurrentToken = Token("", 0, _CurrentPos(),
			TOKEN_END_OF_LINE);
	}

	bool decimal = *fCurrentChar == '.';

	if (decimal || isdigit(static_cast<unsigned char>(*fCurrentChar))) {
		// Numeric constant; hex numbers are handled separately.
		if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
			return _ParseHexOperand();

		BString temp;
		const char* begin = fCurrentChar;

		// integral part
		while (isdigit(static_cast<unsigned char>(*fCurrentChar))) {
			temp << *fCurrentChar;
			fCurrentChar++;
		}

		// optional fractional part
		if (*fCurrentChar == '.') {
			decimal = true;
			temp << '.';
			fCurrentChar++;

			while (isdigit(static_cast<unsigned char>(*fCurrentChar))) {
				temp << *fCurrentChar;
				fCurrentChar++;
			}
		}

		int32 length = fCurrentChar - begin;
		if (length == 1 && decimal) {
			// A lone '.' is not a number. Rewind and let the operator
			// parser turn it into a token.
			fCurrentChar = begin;
			if (!_ParseOperator())
				throw ParseException("unexpected character", _CurrentPos());

			return fCurrentToken;
		}

		// Validate the number: append a known suffix and require that
		// sscanf() consumes the number and then finds that suffix.
		BString test = temp;
		test << "&_";
		double value;
		char t[2];
		int32 matches = sscanf(test.String(), "%lf&%s", &value, t);
		if (matches != 2)
			throw ParseException("error in constant", _CurrentPos() - length);

		fCurrentToken = Token(begin, length, _CurrentPos() - length,
			TOKEN_CONSTANT);
		if (decimal)
			fCurrentToken.value.SetTo(value);
		else
			fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
	} else if (isalpha(static_cast<unsigned char>(*fCurrentChar))
		|| *fCurrentChar == '_') {
		// Identifier: a letter or '_' followed by letters, digits and '_'.
		const char* begin = fCurrentChar;
		while (*fCurrentChar != 0
			&& (isalpha(static_cast<unsigned char>(*fCurrentChar))
				|| isdigit(static_cast<unsigned char>(*fCurrentChar))
				|| *fCurrentChar == '_')) {
			fCurrentChar++;
		}

		int32 length = fCurrentChar - begin;
		fCurrentToken = Token(begin, length, _CurrentPos() - length,
			TOKEN_IDENTIFIER);
	} else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
		// Quoted literal. *begin is the opening quote, which is also the
		// terminator we are looking for.
		bool terminatorFound = false;
		const char* begin = fCurrentChar++;
		while (*fCurrentChar != 0) {
			if (*fCurrentChar == '\\') {
				// Skip the backslash and the character it escapes, but
				// never step over the terminating NUL: a literal ending in
				// a lone backslash must not make us read past the buffer.
				fCurrentChar++;
				if (*fCurrentChar != 0)
					fCurrentChar++;
			} else if (*(fCurrentChar++) == *begin) {
				terminatorFound = true;
				break;
			}
		}

		int32 tokenType = TOKEN_STRING_LITERAL;
		if (!terminatorFound) {
			// Unterminated literal: return just the quote character and
			// continue tokenizing right behind it.
			tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE
				: TOKEN_SINGLE_QUOTE;
			fCurrentChar = begin + 1;
		}

		int32 length = fCurrentChar - begin;
		fCurrentToken = Token(begin, length, _CurrentPos() - length,
			tokenType);
	} else {
		// Operators first, then single-character punctuation.
		if (!_ParseOperator()) {
			int32 type = TOKEN_NONE;
			switch (*fCurrentChar) {
				case '\n':
					// NOTE(review): newlines are normally consumed by the
					// whitespace skip above, so this case looks unreachable.
					type = TOKEN_END_OF_LINE;
					break;

				case '(':
					type = TOKEN_OPENING_PAREN;
					break;
				case ')':
					type = TOKEN_CLOSING_PAREN;
					break;

				case '[':
					type = TOKEN_OPENING_SQUARE_BRACKET;
					break;
				case ']':
					type = TOKEN_CLOSING_SQUARE_BRACKET;
					break;

				case '{':
					type = TOKEN_OPENING_CURLY_BRACE;
					break;
				case '}':
					type = TOKEN_CLOSING_CURLY_BRACE;
					break;

				case '\\':
					type = TOKEN_BACKSLASH;
					break;

				case ':':
					type = TOKEN_COLON;
					break;

				case ';':
					type = TOKEN_SEMICOLON;
					break;

				case ',':
					type = TOKEN_COMMA;
					break;

				case '.':
					// NOTE(review): _ParseOperator() already accepts '.',
					// so this case is not reached from here.
					type = TOKEN_PERIOD;
					break;

				case '#':
					type = TOKEN_POUND;
					break;

				default:
					throw ParseException("unexpected character",
						_CurrentPos());
			}

			fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(),
				type);
			fCurrentChar++;
		}
	}

	return fCurrentToken;
}
bool
Tokenizer::_ParseOperator()
{
int32 type = TOKEN_NONE;
int32 length = 0;
switch (*fCurrentChar) {
case '+':
type = TOKEN_PLUS;
length = 1;
break;
case '-':
if (_Peek() == '>') {
type = TOKEN_MEMBER_PTR;
length = 2;
} else {
type = TOKEN_MINUS;
length = 1;
}
break;
case '*':
switch (_Peek()) {
case '/':
type = TOKEN_END_COMMENT_BLOCK;
length = 2;
break;
default:
type = TOKEN_STAR;
length = 1;
break;
}
break;
case '/':
switch (_Peek()) {
case '*':
type = TOKEN_BEGIN_COMMENT_BLOCK;
length = 2;
break;
case '/':
type = TOKEN_INLINE_COMMENT;
length = 2;
break;
default:
type = TOKEN_SLASH;
length = 1;
break;
}
break;
case '%':
type = TOKEN_MODULO;
length = 1;
break;
case '^':
type = TOKEN_BITWISE_XOR;
length = 1;
break;
case '&':
if (_Peek() == '&') {
type = TOKEN_LOGICAL_AND;
length = 2;
} else {
type = TOKEN_BITWISE_AND;
length = 1;
}
break;
case '|':
if (_Peek() == '|') {
type = TOKEN_LOGICAL_OR;
length = 2;
} else {
type = TOKEN_BITWISE_OR;
length = 1;
}
break;
case '!':
if (_Peek() == '=') {
type = TOKEN_NE;
length = 2;
} else {
type = TOKEN_LOGICAL_NOT;
length = 1;
}
break;
case '=':
if (_Peek() == '=') {
type = TOKEN_EQ;
length = 2;
} else {
type = TOKEN_ASSIGN;
length = 1;
}
break;
case '>':
if (_Peek() == '=') {
type = TOKEN_GE;
length = 2;
} else {
type = TOKEN_GT;
length = 1;
}
break;
case '<':
if (_Peek() == '=') {
type = TOKEN_LE;
length = 2;
} else {
type = TOKEN_LT;
length = 1;
}
break;
case '~':
type = TOKEN_BITWISE_NOT;
length = 1;
break;
case '?':
type = TOKEN_CONDITION;
length = 1;
break;
case '.':
type = TOKEN_MEMBER_PTR;
length = 1;
break;
default:
break;
}
if (length == 0)
return false;
fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type);
fCurrentChar += length;
return true;
}
// Requests that the next call to NextToken() returns the current token again
// instead of scanning a new one. Only the reuse flag is set here; the flag is
// cleared by NextToken() when it honors the request, and by SetTo().
void
Tokenizer::RewindToken()
{
fReuseToken = true;
}
// Returns the character following the current one without moving the
// cursor, or '\0' if the cursor is not in front of the end of the string.
char
Tokenizer::_Peek() const
{
	const bool insideString = _CurrentPos() < fString.Length();
	return insideString ? fCurrentChar[1] : '\0';
}
// Returns whether \a c is a hexadecimal digit (0-9, a-f or A-F).
bool
Tokenizer::_IsHexDigit(char c)
{
	// The <ctype.h> classification functions require a value representable
	// as unsigned char; passing a negative plain char (a non-ASCII byte) is
	// undefined behavior, hence the cast. isxdigit() accepts exactly the
	// digits and the letters a-f/A-F.
	return isxdigit(static_cast<unsigned char>(c)) != 0;
}
// Parses a hexadecimal constant. The cursor must point at its "0x" prefix.
//
// The constant becomes the current token (TOKEN_CONSTANT) and the cursor is
// moved behind its last digit. Constants of up to 10 characters including
// the prefix (i.e. up to 8 hex digits) get a uint32 value, longer ones a
// uint64 value.
//
// Throws ParseException ("expected hex digit") if no hex digit follows the
// prefix.
Token&
Tokenizer::_ParseHexOperand()
{
	const char* const start = fCurrentChar;

	// Step over the "0x" prefix; at least one digit has to follow.
	fCurrentChar += 2;
	if (!_IsHexDigit(*fCurrentChar))
		throw ParseException("expected hex digit", _CurrentPos());

	// Consume the digit just checked and all further ones.
	do {
		fCurrentChar++;
	} while (_IsHexDigit(*fCurrentChar));

	const int32 length = fCurrentChar - start;
	fCurrentToken = Token(start, length, _CurrentPos() - length,
		TOKEN_CONSTANT);

	const char* const text = fCurrentToken.string.String();
	if (length > 10)
		fCurrentToken.value.SetTo((uint64)strtoull(text, NULL, 16));
	else
		fCurrentToken.value.SetTo((uint32)strtoul(text, NULL, 16));

	return fCurrentToken;
}
// Returns the offset of the cursor from the start of the tokenized string.
int32
Tokenizer::_CurrentPos() const
{
	const char* const start = fString.String();
	return fCurrentChar - start;
}