| home | contents | previous | next page | send comment | send link | add bookmark |

scanner.cpp

/*
**  scanner.cpp
**
**  class definitions for a C/C++ lexical analyzer 
**
**  By: Stephen R. Schmitt
*/

#include "stdafx.h"                             // generated by Visual C++ ver.6
#include <ctype.h>
#include <string.h>
#include "scanner.h"

/*---------------------------------------------------------------------------*
**  "advance" sets curr_ch to the next character in the source file and
**  next_ch to the one-character lookahead that follows it.
**
**  The source text is treated as NUL terminated ("get_token" reports
**  END_OF_FILE on a 0 character).  Once the terminator is reached the
**  scanner stays on it: i_src is not moved and next_ch is forced to 0, so
**  repeated calls never read beyond the end of the buffer.
**
**  returns:  the current character
*/
char scanner::advance()
{
    // only i_src needs to be initialized
    curr_ch = source[i_src];

    if( curr_ch == 0 )                          // at the terminator: stay put
    {
        next_ch = 0;
        return curr_ch;
    }

    next_ch = source[i_src + 1];                // safe: curr_ch is not the end
    i_src++;

    return curr_ch;
}

/*---------------------------------------------------------------------------*
**  "get_token" extracts the next token from the source file
**
**  Skips white space, C style and C++ style comments, then hands the first
**  significant character to "get_word", "get_number" or "get_special".
**  The previous token code is saved in "previous" before scanning starts.
**  A 0 character anywhere (even inside a comment) yields END_OF_FILE.
**
**  returns:  nothing
*/
void scanner::get_token()
{
    // scanning states; SCAN_DONE must be 0 because it ends the loop
    enum { SCAN_DONE = 0, SCAN_START = 1, SCAN_C_COMMENT = 2, SCAN_CPP_COMMENT = 3 };

    int state = SCAN_START;

    previous = code;

    while( state )
    {
        advance();
        if( curr_ch == '\n' )
            line_count++;

        if( curr_ch == 0 )
        {
            code = END_OF_FILE;
            return;
        }

        switch( state )
        {
        case SCAN_START:
            state = SCAN_DONE;                            // assume token
            lexeme[0] = curr_ch;
            i_lex = 1;

            // the <ctype.h> functions need a value in unsigned char range
            if( isspace( static_cast<unsigned char>( curr_ch ) ) )
                state = SCAN_START;                       // is white space
            else if( curr_ch == '/' && next_ch == '*' )   // is C style comment
            {
                // consume the '*' of the opener so that it cannot be reused
                // as the '*' of a closer ("/*/" does not end the comment)
                advance();
                state = SCAN_C_COMMENT;
            }
            else if( curr_ch == '/' && next_ch == '/' )   // is C++ style comment
                state = SCAN_CPP_COMMENT;
            else if( isalpha( static_cast<unsigned char>( curr_ch ) ) || curr_ch == '_' )
                get_word();
            else if( isdigit( static_cast<unsigned char>( curr_ch ) ) )
                get_number();
            else
                get_special();
            break;

        case SCAN_C_COMMENT:
            if( curr_ch == '*' && next_ch == '/' )        // end of C comment
            {
                advance();                                // to next_ch
                state = SCAN_START;
            }
            break;

        case SCAN_CPP_COMMENT:
            if( curr_ch == '\n' )                         // end of C++ comment
                state = SCAN_START;
            break;
        }
    }
}

// Keyword table used by scanner::get_word(): maps a keyword spelling to its
// token code.  scanner::find() searches it with a binary search based on
// strcmp(), so the entries MUST stay in ascending strcmp() order; insert any
// new keyword at its sorted position.
LEX_TABLE Word_table[] =
{
    { "asm",                ASM },
    { "auto",               AUTO },
    { "bool",               BOOL },
    { "break",              BREAK },
    { "case",               CASE },
    { "catch",              CATCH },
    { "char",               CHAR },
    { "class",              CLASS },
    { "const",              CONST },
    { "const_cast",         CONST_CAST },
    { "continue",           CONTINUE },
    { "default",            DEFAULT },
    { "delete",             DELETE },
    { "do",                 DO },
    { "double",             DOUBLE },
    { "dynamic_cast",       DYNAMIC_CAST },
    { "else",               ELSE },
    { "enum",               ENUM },
    { "explicit",           EXPLICIT },
    { "extern",             EXTERN },
    { "false",              FALSE },
    { "float",              FLOAT },
    { "for",                FOR },
    { "friend",             FRIEND },
    { "goto",               GOTO },
    { "if",                 IF },
    { "inline",             INLINE },
    { "int",                INT },
    { "interrupt",          INTERRUPT },
    { "long",               LONG },
    { "mutable",            MUTABLE },
    { "namespace",          NAMESPACE },
    { "new",                NEW },
    { "operator",           OPERATOR },
    { "private",            PRIVATE },
    { "protected",          PROTECTED },
    { "public",             PUBLIC },
    { "register",           REGISTER },
    { "reinterpret_cast",   REINTERPRET_CAST },
    { "return",             RETURN },
    { "short",              SHORT },
    { "signed",             SIGNED },
    { "sizeof",             SIZEOF },
    { "static",             STATIC },
    { "static_cast",        STATIC_CAST },
    { "struct",             STRUCT },
    { "switch",             SWITCH },
    { "template",           TEMPLATE },
    { "this",               THIS },
    { "throw",              THROW },
    { "true",               TRUE },
    { "try",                TRY },
    { "typedef",            TYPEDEF },
    { "typeid",             TYPEID },
    { "typename",           TYPENAME },
    { "union",              UNION },
    { "unsigned",           UNSIGNED },
    { "using",              USING },
    { "virtual",            VIRTUAL },
    { "void",               VOID },
    { "volatile",           VOLATILE },
    { "while",              WHILE },
};

/*---------------------------------------------------------------------------*
**  "find" looks a word up in a lexeme table by binary search.
**
**  The table must be sorted in ascending strcmp() order.  "high" is the
**  index of the last entry (number of items minus one).
**
**  returns:  the token code of the matching entry, or ID_TOKEN when the
**            word is not in the table
*/
TOKEN_CODE scanner::find( LEX_TABLE table[],    // to search 
                          char *word,           // to search for
                          int high )            // items in table
{
    int first = 0;
    int last  = high;

    while( first <= last )
    {
        const int probe = first + ( last - first ) / 2;
        const int order = strcmp( table[probe].lxm, word );

        if( order < 0 )                         // entry sorts before word
            first = probe + 1;
        else if( order > 0 )                    // entry sorts after word
            last = probe - 1;
        else
            return table[probe].tkn;            // exact match
    }

    return ID_TOKEN;                            // not a table entry
}

/*---------------------------------------------------------------------------*
**  "get_word" gets a word (keyword or identifier) from the source.
**  Assumes that first char is in token lexeme string.
**
**  Letters, digits and '_' are appended to the lexeme while they appear in
**  the lookahead; the finished word is then looked up in Word_table, which
**  yields either a keyword code or ID_TOKEN.
**
**  returns:  nothing
*/
void scanner::get_word()
{
    const int high = sizeof( Word_table ) / sizeof( Word_table[0] ) - 1;

    on_line = line_count;

    // <ctype.h> functions are undefined for negative char values, so the
    // lookahead is converted to unsigned char before classification
    while( next_ch == '_' || isalnum( static_cast<unsigned char>( next_ch ) ) )
        lexeme[i_lex++] = advance();

    lexeme[i_lex] = 0;

    code = find( Word_table, lexeme, high );
}

// states of the number recognizer in get_number()
enum
{
    INTEGER   = 0,                              // on the first digit
    INT_8     = 1,                              // inside an octal constant
    INT_10    = 2,                              // inside a decimal constant
    INT_16    = 3,                              // inside a hex constant
    FRACTION  = 4,                              // after the decimal point
    EXP_SIGN  = 5,                              // on 'e'/'E', sign may follow
    EXP_DIGIT = 6,                              // on exponent sign, digit must follow
    EXPONENT  = 7,                              // inside the exponent digits
    ALL_DONE  = 8                               // token complete
};

/*---------------------------------------------------------------------------*
 *  "get_number" extracts a number token.  The first char is always a 
 *  digit.
 *
 *  returns: nothing
 */
void scanner::get_number()
{
    int state = INTEGER;                        // set defaults
    code    = INT_TOKEN;
    
    while( 1 )
    {   
        switch( state )
        {
        case INTEGER:
            if( '0' == curr_ch )
            {
                if( 'x' == next_ch || 'X' == next_ch )
                {
                    lexeme[i_lex++] = advance();
                    if( isxdigit( next_ch ) )
                    {
                        code  = HEX_TOKEN;
                        state = INT_16;
                    }
                    else
                        code  = ERR_TOKEN;
                }
                else if( '0' <= next_ch && next_ch < '8' )
                {
                    code  = OCT_TOKEN;
                    state = INT_8;
                }
                else if( isalnum( next_ch ) )
                    code  = ERR_TOKEN;
                else
                    state = ALL_DONE;
            }
            else if( isdigit( next_ch ) )
                state = INT_10;
            else if( isalpha( next_ch ) )
                code  = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;

        case INT_16:
            if( isxdigit( next_ch ) )
                break;
            else if( isalpha( next_ch ) )
                code  = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;
        
        case INT_8:
            if( '0' <= next_ch && next_ch < '8' )
                break;
            else if( isalnum( next_ch ) )
                code  = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;
        
        
        case INT_10:
            if( 'e' == next_ch || 'E' == next_ch )
            {
                code  = REAL_TOKEN;
                state = EXP_SIGN;
            }
            else if( '.' == next_ch )
            {
                code  = REAL_TOKEN;
                state = FRACTION;
            }
            else if( !isdigit( next_ch ) )
                state = ALL_DONE;
            break;

        case FRACTION:
            if( 'e' == next_ch || 'E' == next_ch )
                state = EXP_SIGN;
            else if( !isdigit( next_ch ) )
                state = ALL_DONE;
            break;

        case EXP_SIGN:
            if( '+' == next_ch || '-' == next_ch )
                state = EXP_DIGIT;
            else if( isdigit( next_ch ) )
                state = EXPONENT;
            else
                code  = ERR_TOKEN;
            break;

        case EXP_DIGIT:
            if( isdigit( next_ch ) )
                state = EXPONENT;
            else
                code  = ERR_TOKEN;
            break;

        case EXPONENT:
            if( !isdigit( next_ch ) )
                state = ALL_DONE;
            break;
        }

        if( ALL_DONE == state || ERR_TOKEN == code )
            break;

        lexeme[i_lex++] = advance();
    }

    // complete token
    on_line        = line_count;
    lexeme[i_lex]  = 0;
}

/*---------------------------------------------------------------------------*
**  "get_special" gets a punctuation token from the source.
**  Assumes that first char is in token lexeme string.
**
**  Characters that form a token by themselves set "code" directly; the
**  others are passed to a helper that inspects the lookahead.  Any
**  character not listed produces ERR_TOKEN.  The lexeme is terminated and
**  the token line recorded once the token is complete.
**
**  returns:  nothing
*/
void scanner::get_special()
{
    switch( curr_ch )
    {
    // tokens fully determined by their first character
    case '.':
        code = PERIOD;
        break;
    case ',':
        code = COMMA;
        break;
    case ';':
        code = SEMICOLON;
        break;
    case '?':
        code = QUESTION;
        break;
    case '~':
        code = COMP_BITS;
        break;
    case '(':
        code = LF_PAREN;
        break;
    case ')':
        code = RT_PAREN;
        break;
    case '[':
        code = LF_BRACKET;
        break;
    case ']':
        code = RT_BRACKET;
        break;
    case '{':
        code = LF_BRACE;
        break;
    case '}':
        code = RT_BRACE;
        break;
    case '\\':
        code = PP_SLASH;
        break;

    // tokens that depend on what follows
    case 0:
        get_special_eof();
        break;
    case '=':
        get_special_eq();
        break;
    case ':':
        get_special_colon();
        break;
    case '!':
        get_special_not();
        break;
    case '+':
        get_special_plus();
        break;
    case '-':
        get_special_minus();
        break;
    case '*':
        get_special_times();
        break;
    case '/':
        get_special_div();
        break;
    case '%':
        get_special_mod();
        break;
    case '>':
        get_special_right();
        break;
    case '<':
        get_special_left();
        break;
    case '&':
        get_special_and();
        break;
    case '|':
        get_special_or();
        break;
    case '^':
        get_special_xor();
        break;
    case '#':
        get_special_pound();
        break;
    case '\"':
        get_special_quote();
        break;
    case '\'':
        get_character();
        break;

    default:
        code = ERR_TOKEN;
        break;
    }

    on_line = line_count;
    lexeme[i_lex] = 0;
}

/*---------------------------------------------------------------------------*
**  "get_special_eof" builds the end of file token: the lexeme becomes the
**  marker text "~eof" and the code END_OF_FILE.
**
**  returns:  nothing
*/
void scanner::get_special_eof()
{
    static const char marker[] = "~eof";

    memcpy( lexeme, marker, sizeof( marker ) ); // copies the terminator too
    i_lex = 4;                                  // length of the marker text
    code  = END_OF_FILE;
}

/*---------------------------------------------------------------------------*
**  "get_special_eq" scans a token starting with '=':  "=" or "=="
**
**  returns:  nothing
*/
void scanner::get_special_eq()
{
    if( next_ch != '=' )                        // plain assignment
    {
        code = ASSIGN;
        return;
    }

    lexeme[i_lex++] = advance();                // take the second '='
    code = EQ;
}

/*---------------------------------------------------------------------------*
**  "get_special_colon" scans a token starting with ':':  ":" or "::"
**
**  returns:  nothing
*/
void scanner::get_special_colon()
{
    const bool is_scope = ( next_ch == ':' );

    if( is_scope )
        lexeme[i_lex++] = advance();            // take the second ':'

    code = is_scope ? COLON_COLON : COLON;
}

/*---------------------------------------------------------------------------*
**  "get_special_not" scans a token starting with '!':  "!" or "!="
**
**  returns:  nothing
*/
void scanner::get_special_not()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = NE;
        break;

    default:
        code = NOT;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_plus" scans a token starting with '+':  "+", "+=" or "++"
**
**  returns:  nothing
*/
void scanner::get_special_plus()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = PLUS_EQ;
        break;

    case '+':
        lexeme[i_lex++] = advance();
        code = INCR;
        break;

    default:
        code = PLUS;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_minus" scans a token starting with '-':
**  "-", "-=", "--" or "->"
**
**  returns:  nothing
*/
void scanner::get_special_minus()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = MINUS_EQ;
        break;

    case '-':
        lexeme[i_lex++] = advance();
        code = DECR;
        break;

    case '>':
        lexeme[i_lex++] = advance();
        code = ARROW;
        break;

    default:
        code = MINUS;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_times" scans a token starting with '*':  "*" or "*="
**
**  returns:  nothing
*/
void scanner::get_special_times()
{
    if( next_ch != '=' )
    {
        code = TIMES;
        return;
    }

    lexeme[i_lex++] = advance();                // take the '='
    code = TIMES_EQ;
}

/*---------------------------------------------------------------------------*
**  "get_special_div" scans a token starting with '/':  "/" or "/="
**
**  returns:  nothing
*/
void scanner::get_special_div()
{
    const bool compound = ( next_ch == '=' );

    if( compound )
        lexeme[i_lex++] = advance();            // take the '='

    code = compound ? DIV_EQ : DIV;
}

/*---------------------------------------------------------------------------*
**  "get_special_mod" scans a token starting with '%':  "%" or "%="
**
**  returns:  nothing
*/
void scanner::get_special_mod()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = MOD_EQ;
        break;

    default:
        code = MOD;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_right" scans a token starting with '>':
**  ">", ">=", ">>" or ">>="
**
**  returns:  nothing
*/
void scanner::get_special_right()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = GE;
        break;

    case '>':
        lexeme[i_lex++] = advance();            // second '>'

        if( next_ch == '=' )
        {
            lexeme[i_lex++] = advance();
            code = RT_SHIFT_EQ;
        }
        else
            code = RT_SHIFT_BITS;
        break;

    default:
        code = GT;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_left" scans a token starting with '<':
**  "<", "<=", "<<" or "<<=".  Directly after an #include command the '<'
**  opens a file name instead, which is read by "get_filename".
**
**  returns:  nothing
*/
void scanner::get_special_left()
{
    if( previous == PP_INCLUDE )                // #include <file>
    {
        get_filename();
        return;
    }

    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = LE;
        break;

    case '<':
        lexeme[i_lex++] = advance();            // second '<'

        if( next_ch == '=' )
        {
            lexeme[i_lex++] = advance();
            code = LF_SHIFT_EQ;
        }
        else
            code = LF_SHIFT_BITS;
        break;

    default:
        code = LT;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_and" scans a token starting with '&':  "&", "&=" or "&&"
**
**  returns:  nothing
*/
void scanner::get_special_and()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = AND_EQ;
        break;

    case '&':
        lexeme[i_lex++] = advance();
        code = AND;
        break;

    default:
        code = AND_BITS;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_or" scans a token starting with '|':  "|", "|=" or "||"
**
**  returns:  nothing
*/
void scanner::get_special_or()
{
    switch( next_ch )
    {
    case '=':
        lexeme[i_lex++] = advance();
        code = OR_EQ;
        break;

    case '|':
        lexeme[i_lex++] = advance();
        code = OR;
        break;

    default:
        code = OR_BITS;
        break;
    }
}

/*---------------------------------------------------------------------------*
**  "get_special_xor" scans a token starting with '^':  "^" or "^="
**
**  returns:  nothing
*/
void scanner::get_special_xor()
{
    if( next_ch != '=' )
    {
        code = XOR_BITS;
        return;
    }

    lexeme[i_lex++] = advance();                // take the '='
    code = XOR_EQ;
}

/*---------------------------------------------------------------------------*
**  "get_special_pound" scans a token starting with '#':  "##" is taken as
**  DOUBLE_POUND, anything else is handed to "get_preprocessor".
**
**  returns:  nothing
*/
void scanner::get_special_pound()
{
    if( next_ch != '#' )                        // a preprocessor command
    {
        get_preprocessor();
        return;
    }

    lexeme[i_lex++] = advance();                // take the second '#'
    code = DOUBLE_POUND;
}

/*---------------------------------------------------------------------------*
**  "get_special_quote" scans a token starting with '"':  a file name when
**  the previous token was #include, otherwise a string.
**
**  returns:  nothing
*/
void scanner::get_special_quote()
{
    if( previous != PP_INCLUDE )                // ordinary string literal
    {
        get_string();
        return;
    }

    get_filename();                             // #include "file"
}

// Preprocessor command table used by scanner::get_preprocessor(): maps a
// command spelling (including the leading '#') to its token code.
// scanner::find() searches it with a binary search based on strcmp(), so the
// entries MUST stay in ascending strcmp() order, reading left to right and
// top to bottom.
LEX_TABLE command_table[] =
{
    { "#define",  PP_DEFINE  }, { "#elif",    PP_ELIF    },
    { "#else",    PP_ELSE    }, { "#endif",   PP_ENDIF   },
    { "#error",   PP_ERROR   }, { "#if",      PP_IF      },
    { "#ifdef",   PP_IFDEF   }, { "#ifndef",  PP_IFNDEF  },
    { "#include", PP_INCLUDE }, { "#pragma",  PP_PRAGMA  },
    { "#undef",   PP_UNDEF   },
};

/*---------------------------------------------------------------------------*
**  "get_preprocessor" gets a preprocessor command
**
**  Assumes the '#' is already in the token lexeme string.  Blanks and tabs
**  between the '#' and the command name are skipped (not stored), then the
**  letters of the name are appended and the result ("#include", ...) is
**  looked up in command_table.  Unknown commands yield ID_TOKEN.
**
**  Only letters are consumed: a '#' followed by something else (end of
**  line, end of file, a digit) leaves that character for the next token
**  instead of swallowing it.
**
**  returns:  nothing
*/
void scanner::get_preprocessor()
{
    const int high = sizeof( command_table ) / sizeof( command_table[0] ) - 1;

    on_line = line_count;

    while( next_ch == ' ' || next_ch == '\t' )  // after #
        advance();

    // <ctype.h> functions need a value in unsigned char range
    while( isalpha( static_cast<unsigned char>( next_ch ) ) )
        lexeme[i_lex++] = advance();

    lexeme[i_lex] = 0;
  
    code = find( command_table, lexeme, high );
}

/*---------------------------------------------------------------------------*
**  "get_filename" gets an #include filename_ext from the source
**
**  returns:  nothing
*/
void scanner::get_filename()
{
    char last_ch;                               // end of string char

    if( curr_ch == '<' )
        last_ch = '>';
    else
        last_ch = '\"';

    do
        lexeme[i_lex++] = advance();
    while( curr_ch != last_ch );

    code    = ID_TOKEN;
    on_line = line_count;
}

/*---------------------------------------------------------------------------*
**  "get_string" gets a string from the source
**
**  returns:  nothing
*/
void scanner::get_string()
{
    do
    {
        if( curr_ch == '\\' )                   // get embedded "     
            lexeme[i_lex++] = advance();    

        lexeme[i_lex++] = advance();
    } 
    while( curr_ch != '\"' );

    code    = STR_TOKEN;
    on_line = line_count;
}

/*---------------------------------------------------------------------------*
 *  "get_character" gets a character token from the source
 *
 *  returns:  nothing
 */
void scanner::get_character()  
{
    lexeme[i_lex++] = advance();                // first char after '
    
    if( curr_ch == '\\' )                       // get embedded '
        lexeme[i_lex++] = advance();

    lexeme[i_lex++] = advance();                // last '
        
    code    = CHR_TOKEN;
    on_line = line_count;
}

| home | contents | previous | next page | send comment | send link | add bookmark |

Copyright © 2004, Stephen R. Schmitt