2012-11-02 23:07:00 +00:00
|
|
|
#include "Lexer.h"
|
|
|
|
|
|
|
|
/// Construct a lexer pre-loaded with the language's default vocabulary:
/// keywords, single-character punctuation, and one/two-character operators.
/// Brace-initialization (C++11, already in use in this file) replaces the
/// old assign(ptr, ptr+N) calls whose hard-coded counts could silently
/// fall out of sync with the literal arrays.
Lexer::Lexer()
{
    // Reserved words recognized as t_keyword.
    keywords = {"function", "class", "for", "while", "if", "else", "true", "false"};

    // Single characters recognized as t_punctuation.
    punctuation = {'.', ',', ';', '{', '}', '[', ']', '(', ')'};

    // One- and two-character operator spellings recognized as t_operator.
    operators = {"<", ">", "+", "-", "/", "*", "%", "&", "|", "=", ":",
                 "==", "+=", "-=", "<=", ">=", "!=", "&&", "||"};
}
|
|
|
|
|
|
|
|
/// Destructor. Nothing to release explicitly: every member owns and
/// cleans up its own storage.
Lexer::~Lexer()
{
}
|
2012-12-16 09:36:19 +00:00
|
|
|
|
|
|
|
std::string Lexer::unescape_string(string text)
|
|
|
|
{
|
|
|
|
std::string result = text;
|
|
|
|
unsigned int offset = 0;
|
|
|
|
for(unsigned int i = 0; i < text.size(); i++)
|
|
|
|
{
|
|
|
|
char replacement = 0;
|
|
|
|
if(text[i] == '\\')
|
|
|
|
{
|
|
|
|
switch (text[i+1])
|
|
|
|
{
|
|
|
|
case 'n':
|
|
|
|
replacement = '\n';
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
replacement = '\r';
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
replacement = '\t';
|
|
|
|
break;
|
|
|
|
case '[':
|
|
|
|
replacement = 27;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
if(replacement != 0)
|
|
|
|
{
|
|
|
|
char replacement_chars[2];
|
|
|
|
replacement_chars[0] = replacement;
|
|
|
|
replacement_chars[1] = 0;
|
|
|
|
|
|
|
|
result.replace(i + offset, 2, replacement_chars);
|
|
|
|
offset--;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
2012-11-02 23:07:00 +00:00
|
|
|
|
|
|
|
/// Register an additional keyword. Duplicates are silently ignored so
/// the list never contains the same word twice.
void Lexer::add_keyword(string word)
{
    if (is_keyword(word)) return;   // already registered
    keywords.push_back(word);
}
|
|
|
|
void Lexer::add_punctuation_char(char c)
|
|
|
|
{
|
|
|
|
if(!is_punctuation(c))
|
|
|
|
{
|
|
|
|
punctuation.push_back(c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Register an additional operator spelling. Duplicates are silently
/// ignored.
void Lexer::add_operator(string oper)
{
    if (is_operator(oper)) return;   // already registered
    operators.push_back(oper);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Tokenize a whole line by repeatedly consuming one token from the
/// front of the remaining text until nothing is left. Tokens with an
/// empty value (e.g. produced from trailing whitespace) are dropped.
vector<Token> Lexer::parse_line(string line)
{
    vector<Token> tokens;

    while (!line.empty())
    {
        pair<string, Token> parsed = parse_token(line);
        line = parsed.first;            // what remains after the token
        Token token = parsed.second;

        if (token.get_value() != "")    // skip empty/whitespace-only results
        {
            tokens.push_back(token);
        }
    }

    return tokens;
}
|
|
|
|
|
|
|
|
/// Consume a single token from the front of `line`.
///
/// Returns a pair of (remaining text after the token, the Token built
/// from the consumed characters). The token's type is decided afterwards
/// by guess_type(); string literals additionally have their escape
/// sequences expanded via unescape_string().
///
/// Scanning order per character: skip leading whitespace, then try in
/// turn a number, a string literal, an identifier character, and finally
/// punctuation/operator characters. The phases deliberately fall through
/// into one another (e.g. "123abc" keeps accumulating as one token), so
/// the statement order here is load-bearing.
pair<string, Token> Lexer::parse_token(string line)
{
    string token_value = "";
    unsigned int i;
    for(i=0; i< line.size(); i++)
    {
        // Skip whitespace only before the token starts; whitespace after
        // the first accumulated character falls through to the checks
        // below and (via the final `else break`) terminates the token.
        if(token_value == "" && isspace(line[i])) continue;
        // Number: a digit starts a run of digits and dots (e.g. "3.14").
        // Validation (single dot, digits only) happens later in guess_type().
        if(isdigit(line[i]))
        {
            token_value += line[i++];
            for(; i < line.size(); i++)
            {
                if(isdigit(line[i]) || line[i] == '.')
                {
                    token_value += line[i];
                }
                else
                {
                    break;
                }
            }
        }
        // NOTE(review): after the loops above, i may equal line.size();
        // line[i] then reads the one-past-the-end position — OK in C++11
        // (operator[] at size() yields '\0') but worth confirming the
        // build never targets C++03. Same applies to the checks below.
        // String literal: copy everything through the closing quote,
        // quotes included. An unterminated literal consumes to end of line.
        if(line[i] == '\"')
        {
            token_value += line[i++];
            for(; i < line.size() ; i++)
            {
                token_value += line[i];
                if (line[i] == '\"')
                {
                    i++;
                    break;
                }
            }
        }
        // Identifier/keyword character: letters, digits and '_' accumulate.
        if(isalnum(line[i]) || line[i]== '_')
        {
            token_value += line[i];
        }
        // Punctuation/operator: only starts a token of its own. If we get
        // here with a non-empty token_value, the punctuation is left for
        // the next parse_token() call (note the unconditional break).
        else if(ispunct(line[i]))
        {
            if(token_value=="")
            {
                token_value=line[i];
                i++;
                // Greedily absorb a following '=' to form two-character
                // operators such as "==", "+=", "<=", "!=".
                if(i<line.size())
                {
                    if(line[i] == '=')
                    {
                        token_value+=line[i];
                        i++;
                    }
                }
            }
            break;
        }
        // Anything else (e.g. whitespace after the token started, or the
        // '\0' read past the end) ends the token.
        else break;
    }
    // Classify the accumulated text, expanding escapes in string literals.
    auto type = guess_type(token_value);
    if(type == t_literal) token_value = unescape_string(token_value);
    Token token = Token(type, token_value);
    // Everything from position i onward is handed back to the caller.
    string truncated_line = line.substr(i);
    return pair<string, Token>(truncated_line, token);
}
|
|
|
|
|
|
|
|
/// True if `value` exactly matches one of the registered keywords.
bool Lexer::is_keyword(string value)
{
    for(const string& keyword : keywords)
    {
        if(keyword == value) return true;
    }
    return false;
}
|
|
|
|
|
|
|
|
bool Lexer::is_punctuation(char c)
|
|
|
|
{
|
|
|
|
|
2012-12-08 19:59:05 +00:00
|
|
|
for(unsigned int i=0; i< punctuation.size(); i++)
|
2012-11-02 23:07:00 +00:00
|
|
|
{
|
|
|
|
if(c == punctuation[i]) return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// True if `value` exactly matches one of the registered operators.
bool Lexer::is_operator(string value)
{
    for(const string& oper : operators)
    {
        if(oper == value) return true;
    }
    return false;
}
|
|
|
|
|
2012-11-04 13:26:36 +00:00
|
|
|
/// Classify a token's raw text.
///
/// Decision order:
///  - empty string                  -> t_invalid_token
///  - starts with a digit           -> t_integer / t_float (one '.' max),
///                                     anything malformed -> t_invalid_token
///  - starts with a letter          -> t_keyword if registered, else t_symbol
///  - starts with '"'               -> t_literal if it also ends with '"',
///                                     else t_invalid_token
///  - one char                      -> t_punctuation or t_operator if registered
///  - two chars                     -> t_operator if registered
///  - everything else               -> t_invalid_token
type_of_token Lexer::guess_type(string value)
{
    if(value == "") return t_invalid_token;

    // Numeric token: digits with at most one decimal point.
    if(isdigit(value[0]))
    {
        bool seen_dot = false;
        bool digits_only = true;

        for(unsigned int pos = 0; pos < value.size(); pos++)
        {
            char c = value[pos];
            if(c == '.')
            {
                if(seen_dot) return t_invalid_token;  // second '.' is malformed
                seen_dot = true;
            }
            else if(!isdigit(c))
            {
                digits_only = false;                  // e.g. "12ab"
            }
        }

        if(!digits_only) return t_invalid_token;
        return seen_dot ? t_float : t_integer;
    }

    // Word token: keyword if registered, plain symbol otherwise.
    if(isalpha(value[0]))
    {
        return is_keyword(value) ? t_keyword : t_symbol;
    }

    // String literal: must carry both opening and closing quotes.
    if(value[0]=='\"')
    {
        return (value[value.size()-1] == '\"') ? t_literal : t_invalid_token;
    }

    // Single character: punctuation takes precedence over operators.
    if(value.size() == 1 )
    {
        if(is_punctuation(value[0])) return t_punctuation;
        if(is_operator(value)) return t_operator;
    }

    // Two characters can only be a registered operator (e.g. "==", "+=").
    if(value.size() == 2 && is_operator(value)) return t_operator;

    return t_invalid_token;
}
|
|
|
|
|
|
|
|
|
|
|
|
|