189 lines
4.0 KiB
C++
189 lines
4.0 KiB
C++
#include "Lexer.h"
|
|
|
|
Lexer::Lexer()
{
    // Seed the lexer with its default vocabulary.  Element counts are
    // derived with sizeof so an array can grow or shrink without having
    // to remember to touch the matching assign() call (the old code
    // hard-coded 6 / 9 / 19 and would silently truncate or overrun).
    string keys[] = {"function", "class", "for", "while", "if", "else"};
    keywords.assign(keys, keys + sizeof(keys) / sizeof(keys[0]));

    char punct[] = {'.', ',', ';', '{', '}', '[', ']', '(', ')'};
    punctuation.assign(punct, punct + sizeof(punct) / sizeof(punct[0]));

    string oper[] = {"<", ">", "+", "-", "/", "*", "%", "&", "|", "=", ":",
                     "==", "+=", "-=", "<=", ">=", "!=", "&&", "||"};
    operators.assign(oper, oper + sizeof(oper) / sizeof(oper[0]));
}
|
|
|
|
Lexer::~Lexer()
{
    // dtor: nothing to release — the keyword/punctuation/operator vectors
    // clean themselves up.
}
|
|
|
|
void Lexer::add_keyword(string word)
{
    // Register an additional keyword; duplicates are silently ignored.
    if(is_keyword(word)) return;
    keywords.push_back(word);
}
|
|
void Lexer::add_punctuation_char(char c)
|
|
{
|
|
if(!is_punctuation(c))
|
|
{
|
|
punctuation.push_back(c);
|
|
}
|
|
}
|
|
|
|
void Lexer::add_operator(string oper)
{
    // Register an additional operator spelling; duplicates are ignored.
    if(is_operator(oper)) return;
    operators.push_back(oper);
}
|
|
|
|
|
|
vector<Token> Lexer::parse_line(string line)
{
    // Tokenize one source line: repeatedly strip a single token off the
    // front of the remaining text until it is exhausted.  Tokens with an
    // empty value (pure whitespace leftovers) are discarded.
    vector<Token> tokens;
    while(!line.empty())
    {
        pair<string, Token> parsed = parse_token(line);
        line = parsed.first;
        if(parsed.second.get_value() != "")
        {
            tokens.push_back(parsed.second);
        }
    }
    return tokens;
}
|
|
|
|
pair<string, Token> Lexer::parse_token(string line)
{
    // Extract the next token from the front of `line`.
    // Returns {remaining text, token}; the token's value is "" when the
    // line held only whitespace.
    //
    // Fixes over the previous version:
    //  * two-char operator lookahead now also consults is_operator(), so
    //    the registered digraphs "&&" and "||" are no longer split into
    //    two one-character tokens (the old code only paired on '=');
    //  * any character inside a double-quoted literal is kept, so
    //    punctuation such as ',' no longer terminates the literal early;
    //  * <cctype> calls take the char cast to unsigned char (passing a
    //    negative char is undefined behavior).
    string token_value = "";
    unsigned int i;
    bool in_char_literal = false;   // true while between double quotes
    for(i = 0; i < line.size(); i++)
    {
        // Skip leading whitespace before the token starts.
        if(token_value == "" && isspace(static_cast<unsigned char>(line[i]))) continue;

        if(isalnum(static_cast<unsigned char>(line[i])) || line[i] == '\"' || line[i] == '_')
        {
            token_value += line[i];
            // A double quote toggles literal mode (open/close).
            if(line[i] == '\"')
            {
                in_char_literal = !in_char_literal;
            }
        }
        else if(in_char_literal)
        {
            // Inside a string literal every character — whitespace and
            // punctuation alike — belongs to the token.
            token_value += line[i];
        }
        else if(ispunct(static_cast<unsigned char>(line[i])))
        {
            if(token_value == "")
            {
                // Punctuation/operator token: take one char, then greedily
                // pair it with the next char when that forms a known
                // two-char operator ('=' pairing kept for compatibility,
                // e.g. "==", "+="; is_operator() adds "&&" and "||").
                token_value = line[i];
                i++;
                if(i < line.size() &&
                   (line[i] == '=' || is_operator(token_value + line[i])))
                {
                    token_value += line[i];
                    i++;
                }
            }
            // Punctuation also terminates a preceding symbol/number token
            // without being consumed.
            break;
        }
        else break;   // whitespace after a token ends it
    }

    Token token = Token(guess_type(token_value), token_value);
    string truncated_line = line.substr(i);

    return pair<string, Token>(truncated_line, token);
}
|
|
|
|
bool Lexer::is_keyword(string value)
{
    // Linear scan over the registered keyword list.
    for(unsigned int idx = 0; idx < keywords.size(); ++idx)
    {
        if(keywords[idx] == value) return true;
    }
    return false;
}
|
|
|
|
bool Lexer::is_punctuation(char c)
|
|
{
|
|
|
|
for(int i=0; i< punctuation.size(); i++)
|
|
{
|
|
if(c == punctuation[i]) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool Lexer::is_operator(string value)
{
    // Linear scan over the registered operator spellings.
    for(unsigned int idx = 0; idx < operators.size(); ++idx)
    {
        if(operators[idx] == value) return true;
    }
    return false;
}
|
|
|
|
type_of_token Lexer::guess_type(string value)
{
    /* Classify a raw token string into one of the type_of_token values:
       t_invalid_token, t_symbol, t_integer, t_literal, t_punctuation,
       t_keyword, t_operator.
       (NOTE(review): the enum sketch that used to live here omitted
       t_operator even though this function returns it — the real enum in
       the header presumably includes it; confirm.)

       Fix over the previous version: a lone double quote (value == "\"")
       used to be classified as t_literal because value[0] and
       value[value.size()-1] are the SAME character; a literal now
       requires length >= 2. */

    if(value == "") return t_invalid_token;

    // Digit-led token: an integer only when every character is a digit.
    if(isdigit(static_cast<unsigned char>(value[0])))
    {
        for(unsigned int i = 1; i < value.size(); i++)
        {
            if(!isdigit(static_cast<unsigned char>(value[i]))) return t_invalid_token;
        }
        return t_integer;
    }

    // Letter-led token: a keyword if registered, otherwise a symbol.
    if(isalpha(static_cast<unsigned char>(value[0])))
    {
        if(is_keyword(value)) return t_keyword;
        else return t_symbol;
    }

    // Quote-led token: a literal only when properly closed.
    if(value[0] == '\"')
    {
        if(value.size() >= 2 && value[value.size() - 1] == '\"') return t_literal;
        else return t_invalid_token;
    }

    // Single character: punctuation first, then one-char operators.
    if(value.size() == 1)
    {
        if(is_punctuation(value[0])) return t_punctuation;
        if(is_operator(value)) return t_operator;
    }

    // Two characters: only the registered two-char operators qualify.
    if(value.size() == 2 && is_operator(value)) return t_operator;

    // Anything else is not a recognizable token.
    return t_invalid_token;
}
|
|
|
|
|
|
|