// sencha-lang/Sencha-lang/Lexer.cpp
// (scraped page metadata preserved as a comment: 258 lines, 5.5 KiB, C++)

#include "Lexer.h"
/// Constructs a lexer pre-loaded with the language's built-in vocabulary:
/// keywords, single-character punctuation, and (one- and two-character)
/// operators.  Callers can extend each table later via add_keyword(),
/// add_punctuation_char() and add_operator().
Lexer::Lexer()
{
    // Initializer-list assignment replaces the old raw-array + assign(ptr, ptr+N)
    // pattern, which duplicated the element counts (10/9/17) by hand.
    keywords = {"import", "class", "for", "while", "if", "else", "true", "false", "and", "or"};
    punctuation = {'.', ',', ';', '{', '}', '[', ']', '(', ')'};
    operators = {"<", ">", "+", "-", "/", "*", "%", "&", "|", "=", ":", "==", "+=", "-=", "<=", ">=", "!="};
}
// Nothing to release by hand: the member containers clean up themselves.
Lexer::~Lexer() = default;
/// Resolves backslash escape sequences in `text` and returns the result.
/// Recognized escapes: \n \r \t \" \' and \[ (which becomes ESC, 0x1B, so
/// literals can embed ANSI terminal sequences).  Unknown escapes are kept
/// verbatim, including their backslash.
///
/// Rewritten as a single forward pass building a fresh string.  The old
/// version edited a copy in place, which (a) read text[i+1] one past the
/// end when the input ended in a backslash, (b) decremented an unsigned
/// `offset` below zero and depended on 32-bit wraparound to land on the
/// right position, and (c) was quadratic due to repeated replace() calls.
std::string Lexer::unescape_string(string text)
{
    std::string result;
    result.reserve(text.size());
    for(unsigned int i = 0; i < text.size(); i++)
    {
        if(text[i] == '\\' && i + 1 < text.size())
        {
            char replacement = 0;
            switch (text[i+1])
            {
            case 'n':
                replacement = '\n';
                break;
            case 'r':
                replacement = '\r';
                break;
            case 't':
                replacement = '\t';
                break;
            case '\"':
                replacement = '\"';
                break;
            case '\'':
                replacement = '\'';
                break;
            case '[':
                replacement = 27; // ESC, for ANSI escape sequences
                break;
            }
            if(replacement != 0)
            {
                result += replacement;
                i++; // consume the character after the backslash too
                continue;
            }
        }
        result += text[i];
    }
    return result;
}
/// Registers `word` as a keyword; adding a duplicate is a silent no-op.
void Lexer::add_keyword(string word)
{
    if(is_keyword(word)) return;
    keywords.push_back(word);
}
/// Registers `c` as a punctuation character; duplicates are ignored.
void Lexer::add_punctuation_char(char c)
{
    if(is_punctuation(c)) return;
    punctuation.push_back(c);
}
/// Registers `oper` as an operator; duplicates are ignored.
void Lexer::add_operator(string oper)
{
    if(is_operator(oper)) return;
    operators.push_back(oper);
}
/// Tokenizes one line of source text by repeatedly carving a token off the
/// front until nothing remains.  Empty token values (produced by whitespace
/// or comments) are dropped from the result.
vector<Token> Lexer::parse_line(string line)
{
    vector<Token> tokens;
    string remainder = line;
    while(!remainder.empty())
    {
        pair<string, Token> parsed = parse_token(remainder);
        remainder = parsed.first;
        Token current = parsed.second;
        if(current.get_value() != "")
        {
            tokens.push_back(current);
        }
    }
    return tokens;
}
/// Extracts the first token from `line` and returns the pair
/// (unconsumed remainder, token).  A token with an empty value means the
/// consumed prefix was only whitespace or a '#' comment.
/// Token shapes recognized here: numbers (optional leading '-', digits and
/// dots), double-quoted string literals, [alnum_]+ symbols/keywords, and
/// punctuation/operator characters (optionally glued to a following '=' to
/// form ==, +=, <=, ...).
pair<string, Token> Lexer::parse_token(string line)
{
    string token_value = "";
    unsigned int i;
    for(i=0; i< line.size(); i++)
    {
        // '#' starts a comment running to end of line.
        if(line[i] == '#')
        {
            i++;
            // FIX: check the bound BEFORE reading line[i]; the original read
            // line[i] first and indexed past the end on a trailing '#'.
            while(i < line.size() && line[i] != '\n') i++;
            break;
        }
        // Skip whitespace that precedes the token.
        if(token_value == "" && isspace(line[i])) continue;
        // Number: leading digit or '-', then digits and dots.
        if(isdigit(line[i]) || line[i] == '-')
        {
            token_value += line[i++];
            for(; i < line.size(); i++)
            {
                if(isdigit(line[i]) || line[i] == '.') token_value += line[i];
                else break;
            }
        }
        // FIX: stop explicitly instead of reading line[line.size()]
        // (the number may have consumed the rest of the line).
        if(i >= line.size()) break;
        // String literal: consume up to and including the closing quote.
        if(line[i] == '\"')
        {
            token_value += line[i++];
            for(; i < line.size() ; i++)
            {
                token_value += line[i];
                if (line[i] == '\"')
                {
                    i++;
                    break;
                }
            }
        }
        // FIX: same explicit bound check after the string literal.
        if(i >= line.size()) break;
        if(isalnum(line[i]) || line[i]== '_')
        {
            token_value += line[i];
        }
        else if(ispunct(line[i]))
        {
            // Punctuation only starts a token of its own; if a token is
            // already in progress it ends just before this character.
            if(token_value=="")
            {
                token_value=line[i];
                i++;
                // Glue a trailing '=' on, forming two-char operators.
                if(i<line.size())
                {
                    if(line[i] == '=')
                    {
                        token_value+=line[i];
                        i++;
                    }
                }
            }
            break;
        }
        else break;
    }
    auto type = guess_type(token_value);
    // Literals keep their quotes; resolve the escape sequences inside them.
    if(type == t_literal) token_value = unescape_string(token_value);
    Token token = Token(type, token_value);
    string truncated_line = line.substr(i);
    return pair<string, Token>(truncated_line, token);
}
/// Returns true when `value` is one of the registered keywords.
/// Linear scan — the table is tiny, so no lookup structure is needed.
bool Lexer::is_keyword(string value)
{
    for(const string& keyword : keywords)
    {
        if(keyword == value) return true;
    }
    return false;
}
/// Returns true when `c` is one of the registered punctuation characters.
bool Lexer::is_punctuation(char c)
{
    for(char candidate : punctuation)
    {
        if(candidate == c) return true;
    }
    return false;
}
/// Returns true when `value` is one of the registered operator spellings.
bool Lexer::is_operator(string value)
{
    for(const string& candidate : operators)
    {
        if(candidate == value) return true;
    }
    return false;
}
/// Classifies a raw token string into a type_of_token.
/// Possible results: t_invalid_token, t_punctuation, t_operator, t_integer,
/// t_float, t_keyword, t_symbol, t_literal.  (The previous comment here
/// listed the enum without t_float and t_operator, both of which this
/// function returns.)
type_of_token Lexer::guess_type(string value)
{
    if(value == "") return t_invalid_token;
    // Single characters: punctuation first, then one-char operators.
    if(value.size() == 1 )
    {
        if(is_punctuation(value[0])) return t_punctuation;
        else
        {
            if(is_operator(value)) return t_operator;
        }
    }
    if(value.size() == 2 && is_operator(value)) return t_operator;
    // Numeric: optional leading '-', digits, at most one '.'.
    if(isdigit(value[0]) || value[0] == '-')
    {
        bool is_number = true;
        bool dot_used = false;
        for(unsigned int i=1; i<value.size(); i++)
        {
            if(value[i] == '.')
            {
                if(dot_used) return t_invalid_token; // second dot: malformed
                dot_used = true;
                continue;
            }
            if(!isdigit(value[i])) is_number = false;
        }
        if(is_number)
        {
            if(dot_used)
            {
                return t_float;
            }
            else return t_integer;
        }
        else return t_invalid_token;
    }
    // FIX: parse_token builds symbols from [alnum_] (it accepts '_'), so an
    // identifier may start with '_'; the old isalpha-only test rejected
    // names like "_tmp" as invalid tokens.
    if(isalpha(value[0]) || value[0] == '_')
    {
        if(is_keyword(value)) return t_keyword;
        else return t_symbol;
    }
    // String literal only if the closing quote is present.
    if(value[0]=='\"')
    {
        if(value[value.size()-1] == '\"') return t_literal;
        else return t_invalid_token;
    }
    // Anything else is unrecognized.
    return t_invalid_token;
}