| home
| contents
| previous
| next page
| send comment
| send link
| add bookmark |
scanner.cpp
/*
** scanner.cpp
**
** class definitions for a C/C++ lexical analyzer
**
** By: Stephen R. Schmitt
*/
#include "stdafx.h" // generated by Visual C++ ver.6
#include <ctype.h>
#include <string.h>
#include "scanner.h"
/*---------------------------------------------------------------------------*
** "advance" makes the character at source[i_src] the current character,
** records the character after it as the look-ahead (next_ch) and steps
** i_src forward by one.
**
** The end of the source is the 0 character (get_token reports END_OF_FILE
** for it).  Once that character is reached the cursor is no longer moved
** and nothing beyond it is read, so repeated calls keep returning 0 instead
** of walking off the end of the buffer.
**
** returns: the current character
*/
char scanner::advance()
{
    // only i_src needs to be initialized
    curr_ch = source[i_src];
    if( curr_ch == 0 )
    {
        next_ch = 0;                    // nothing valid lies past the terminator
        return curr_ch;
    }
    next_ch = source[i_src + 1];
    i_src++;
    return curr_ch;
}
/*---------------------------------------------------------------------------*
** "get_token" extracts the next token from the source file.
**
** The previous token code is saved in "previous", then characters are read
** through a small state machine:
**   state 1 - looking for the start of a token (white space is skipped)
**   state 2 - inside a C style comment
**   state 3 - inside a C++ style comment
**   state 0 - a token has been recognized, the loop ends
** Every '\n' read here increments line_count.  Reading the 0 character sets
** code to END_OF_FILE and returns immediately.
**
** returns: nothing
*/
void scanner::get_token()
{
    int state = 1;
    previous = code;
    while( state )
    {
        advance();
        if( curr_ch == '\n' )
            line_count++;
        if( curr_ch == 0 )
        {
            code = END_OF_FILE;
            return;
        }
        // <ctype.h> functions need a value representable as unsigned char
        const unsigned char uch = static_cast<unsigned char>( curr_ch );
        switch( state )
        {
        case 1:
            state = 0;                          // assume token
            lexeme[0] = curr_ch;
            i_lex = 1;
            if( isspace( uch ) )                // is white space
                state = 1;
            else if( curr_ch == '/' && next_ch == '*' ) // is C style comment
            {
                // consume the '*' of the opener so that it cannot also be
                // taken as the '*' of a closing "*/" (e.g. in "/*/")
                advance();
                state = 2;
            }
            else if( curr_ch == '/' && next_ch == '/' ) // is C++ style comment
                state = 3;
            else if( isalpha( uch ) || curr_ch == '_' )
                get_word();
            else if( isdigit( uch ) )
                get_number();
            else
                get_special();
            break;
        case 2:
            if( curr_ch == '*' && next_ch == '/' )      // end of C comment
            {
                advance();                      // to next_ch
                state = 1;
            }
            break;
        case 3:
            if( curr_ch == '\n' )               // end of C++ comment
                state = 1;
            break;
        }
    }
}
/*
** Keyword table: maps each reserved word to its token code.
** get_word() looks words up here through find(), which performs a binary
** search using strcmp(), so the entries must stay in ascending strcmp()
** order.
*/
LEX_TABLE Word_table[] =
{
{ "asm", ASM },
{ "auto", AUTO },
{ "bool", BOOL },
{ "break", BREAK },
{ "case", CASE },
{ "catch", CATCH },
{ "char", CHAR },
{ "class", CLASS },
{ "const", CONST },
{ "const_cast", CONST_CAST },
{ "continue", CONTINUE },
{ "default", DEFAULT },
{ "delete", DELETE },
{ "do", DO },
{ "double", DOUBLE },
{ "dynamic_cast", DYNAMIC_CAST },
{ "else", ELSE },
{ "enum", ENUM },
{ "explicit", EXPLICIT },
{ "extern", EXTERN },
{ "false", FALSE },
{ "float", FLOAT },
{ "for", FOR },
{ "friend", FRIEND },
{ "goto", GOTO },
{ "if", IF },
{ "inline", INLINE },
{ "int", INT },
{ "interrupt", INTERRUPT },
{ "long", LONG },
{ "mutable", MUTABLE },
{ "namespace", NAMESPACE },
{ "new", NEW },
{ "operator", OPERATOR },
{ "private", PRIVATE },
{ "protected", PROTECTED },
{ "public", PUBLIC },
{ "register", REGISTER },
{ "reinterpret_cast", REINTERPRET_CAST },
{ "return", RETURN },
{ "short", SHORT },
{ "signed", SIGNED },
{ "sizeof", SIZEOF },
{ "static", STATIC },
{ "static_cast", STATIC_CAST },
{ "struct", STRUCT },
{ "switch", SWITCH },
{ "template", TEMPLATE },
{ "this", THIS },
{ "throw", THROW },
{ "true", TRUE },
{ "try", TRY },
{ "typedef", TYPEDEF },
{ "typeid", TYPEID },
{ "typename", TYPENAME },
{ "union", UNION },
{ "unsigned", UNSIGNED },
{ "using", USING },
{ "virtual", VIRTUAL },
{ "void", VOID },
{ "volatile", VOLATILE },
{ "while", WHILE },
};
/*---------------------------------------------------------------------------*
** "find" looks a word up in a lexeme table with a binary search.
** The table must be ordered by strcmp(); "high" is the index of the last
** entry to consider (callers pass the entry count minus one).
**
** returns: the token code stored with the matching entry, or ID_TOKEN
**          when no entry matches
*/
TOKEN_CODE scanner::find( LEX_TABLE table[], // to search
                          char *word,        // to search for
                          int high )         // index of last table entry
{
    for( int low = 0; low <= high; )
    {
        const int probe = low + ( high - low ) / 2;
        const int order = strcmp( table[probe].lxm, word );
        if( order == 0 )
            return table[probe].tkn;    // exact match
        if( order < 0 )
            low = probe + 1;            // entry sorts before word: go right
        else
            high = probe - 1;           // entry sorts after word: go left
    }
    return ID_TOKEN;                    // no entry matched
}
/*---------------------------------------------------------------------------*
** "get_word" gets a word (keyword or identifier) from the source.
** Assumes that first char is in token lexeme string.
**
** Letters, digits and '_' are appended to lexeme while they appear as the
** look-ahead character; the finished word is then looked up in Word_table
** and code receives the keyword's token code or ID_TOKEN.
**
** returns: nothing
*/
void scanner::get_word()
{
    const int high = sizeof( Word_table ) / sizeof( LEX_TABLE ) - 1;
    on_line = line_count;
    // cast: <ctype.h> functions are undefined for negative char values
    while( isalnum( static_cast<unsigned char>( next_ch ) ) || next_ch == '_' )
        lexeme[i_lex++] = advance();
    lexeme[i_lex] = 0;
    code = find( Word_table, lexeme, high );
}
// used in get_number(): states of the number-scanning state machine
const int INTEGER = 0;   // start state: only the first digit has been read
const int INT_8 = 1;     // reading the digits of an octal constant
const int INT_10 = 2;    // reading the digits of a decimal constant
const int INT_16 = 3;    // reading the digits of a hexadecimal constant
const int FRACTION = 4;  // reading the digits that follow the decimal point
const int EXP_SIGN = 5;  // exponent marker reached: a sign or a digit must follow
const int EXP_DIGIT = 6; // exponent sign reached: a digit must follow
const int EXPONENT = 7;  // reading the digits of the exponent
const int ALL_DONE = 8;  // the number is complete
/*---------------------------------------------------------------------------*
 * "get_number" extracts a number token. The first char is always a
 * digit and is already stored in lexeme.
 *
 * Each pass of the loop inspects the look-ahead character (next_ch) for
 * the current state.  Unless the number is complete or an error was found,
 * the look-ahead character is then appended to lexeme.
 *
 * code is set to INT_TOKEN, OCT_TOKEN, HEX_TOKEN or REAL_TOKEN, or to
 * ERR_TOKEN for a malformed number; the offending character is left in
 * the source.
 *
 * A one-digit integer part followed by '.' or an exponent ("1.5", "0.5",
 * "1e5") is scanned as a real number, the same as a multi-digit integer
 * part ("12.5").
 *
 * returns: nothing
 */
void scanner::get_number()
{
    int state = INTEGER;                // set defaults
    code = INT_TOKEN;
    while( 1 )
    {
        // casts: <ctype.h> functions are undefined for negative char values
        switch( state )
        {
        case INTEGER:
            if( '0' == curr_ch )
            {
                if( 'x' == next_ch || 'X' == next_ch )
                {
                    lexeme[i_lex++] = advance();
                    if( isxdigit( static_cast<unsigned char>( next_ch ) ) )
                    {
                        code = HEX_TOKEN;
                        state = INT_16;
                    }
                    else
                        code = ERR_TOKEN;
                }
                else if( '0' <= next_ch && next_ch < '8' )
                {
                    code = OCT_TOKEN;
                    state = INT_8;
                }
                else if( '.' == next_ch )               // "0.5"
                {
                    code = REAL_TOKEN;
                    state = FRACTION;
                }
                else if( 'e' == next_ch || 'E' == next_ch )     // "0e1"
                {
                    code = REAL_TOKEN;
                    state = EXP_SIGN;
                }
                else if( isalnum( static_cast<unsigned char>( next_ch ) ) )
                    code = ERR_TOKEN;
                else
                    state = ALL_DONE;
            }
            else if( isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = INT_10;
            else if( '.' == next_ch )                   // "1.5"
            {
                code = REAL_TOKEN;
                state = FRACTION;
            }
            else if( 'e' == next_ch || 'E' == next_ch ) // "1e5"
            {
                code = REAL_TOKEN;
                state = EXP_SIGN;
            }
            else if( isalpha( static_cast<unsigned char>( next_ch ) ) )
                code = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;
        case INT_16:
            if( isxdigit( static_cast<unsigned char>( next_ch ) ) )
                break;
            else if( isalpha( static_cast<unsigned char>( next_ch ) ) )
                code = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;
        case INT_8:
            if( '0' <= next_ch && next_ch < '8' )
                break;
            else if( isalnum( static_cast<unsigned char>( next_ch ) ) )
                code = ERR_TOKEN;
            else
                state = ALL_DONE;
            break;
        case INT_10:
            if( 'e' == next_ch || 'E' == next_ch )
            {
                code = REAL_TOKEN;
                state = EXP_SIGN;
            }
            else if( '.' == next_ch )
            {
                code = REAL_TOKEN;
                state = FRACTION;
            }
            else if( !isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = ALL_DONE;
            break;
        case FRACTION:
            if( 'e' == next_ch || 'E' == next_ch )
                state = EXP_SIGN;
            else if( !isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = ALL_DONE;
            break;
        case EXP_SIGN:
            if( '+' == next_ch || '-' == next_ch )
                state = EXP_DIGIT;
            else if( isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = EXPONENT;
            else
                code = ERR_TOKEN;
            break;
        case EXP_DIGIT:
            if( isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = EXPONENT;
            else
                code = ERR_TOKEN;
            break;
        case EXPONENT:
            if( !isdigit( static_cast<unsigned char>( next_ch ) ) )
                state = ALL_DONE;
            break;
        }
        if( ALL_DONE == state || ERR_TOKEN == code )
            break;
        lexeme[i_lex++] = advance();    // accept the look-ahead character
    }
    // complete token
    on_line = line_count;
    lexeme[i_lex] = 0;
}
/*---------------------------------------------------------------------------*
** "get_special" classifies a punctuation or operator token.
** Assumes that first char is in token lexeme string.
**
** Tokens that are always one character long get their code directly; the
** others are passed to a helper that may read further characters.  Any
** other character yields ERR_TOKEN.  Afterwards lexeme is terminated and
** on_line is set to the current line count.
**
** returns: nothing
*/
void scanner::get_special()
{
    switch( curr_ch )
    {
    // tokens that are complete after one character
    case '.':  code = PERIOD;       break;
    case ',':  code = COMMA;        break;
    case ';':  code = SEMICOLON;    break;
    case '?':  code = QUESTION;     break;
    case '~':  code = COMP_BITS;    break;
    case '(':  code = LF_PAREN;     break;
    case ')':  code = RT_PAREN;     break;
    case '[':  code = LF_BRACKET;   break;
    case ']':  code = RT_BRACKET;   break;
    case '{':  code = LF_BRACE;     break;
    case '}':  code = RT_BRACE;     break;
    case '\\': code = PP_SLASH;     break;

    // tokens handled by a helper
    case 0:    get_special_eof();   break;
    case '=':  get_special_eq();    break;
    case ':':  get_special_colon(); break;
    case '!':  get_special_not();   break;
    case '+':  get_special_plus();  break;
    case '-':  get_special_minus(); break;
    case '*':  get_special_times(); break;
    case '/':  get_special_div();   break;
    case '%':  get_special_mod();   break;
    case '>':  get_special_right(); break;
    case '<':  get_special_left();  break;
    case '&':  get_special_and();   break;
    case '|':  get_special_or();    break;
    case '^':  get_special_xor();   break;
    case '#':  get_special_pound(); break;
    case '\"': get_special_quote(); break;
    case '\'': get_character();     break;

    default:   code = ERR_TOKEN;    break;
    }
    on_line = line_count;
    lexeme[i_lex] = 0;
}
/*---------------------------------------------------------------------------*
** "get_special_eof" produces the end-of-file token: lexeme becomes the
** text "~eof" (4 characters) and code becomes END_OF_FILE.
**
** returns: nothing
*/
void scanner::get_special_eof()
{
    strcpy( lexeme, "~eof" );
    i_lex = 4;                  // length of "~eof"
    code = END_OF_FILE;
}
/*---------------------------------------------------------------------------*
** "get_special_eq" scans a token starting with '=':
**   "==" gives EQ, a lone "=" gives ASSIGN.
**
** returns: nothing
*/
void scanner::get_special_eq()
{
    if( next_ch != '=' )
    {
        code = ASSIGN;
        return;
    }
    lexeme[i_lex++] = advance();        // second '='
    code = EQ;
}
/*---------------------------------------------------------------------------*
** "get_special_colon" scans a token starting with ':':
**   "::" gives COLON_COLON, a lone ":" gives COLON.
**
** returns: nothing
*/
void scanner::get_special_colon()
{
    if( next_ch != ':' )
    {
        code = COLON;
        return;
    }
    lexeme[i_lex++] = advance();        // second ':'
    code = COLON_COLON;
}
/*---------------------------------------------------------------------------*
** "get_special_not" scans a token starting with '!':
**   "!=" gives NE, a lone "!" gives NOT.
**
** returns: nothing
*/
void scanner::get_special_not()
{
    if( next_ch != '=' )
    {
        code = NOT;
        return;
    }
    lexeme[i_lex++] = advance();        // the '='
    code = NE;
}
/*---------------------------------------------------------------------------*
** "get_special_plus" scans a token starting with '+':
**   "+=" gives PLUS_EQ, "++" gives INCR, a lone "+" gives PLUS.
**
** returns: nothing
*/
void scanner::get_special_plus()
{
    switch( next_ch )
    {
    case '=': code = PLUS_EQ; break;
    case '+': code = INCR;    break;
    default:  code = PLUS;    return;   // one-character token
    }
    lexeme[i_lex++] = advance();        // take the second character
}
/*---------------------------------------------------------------------------*
** "get_special_minus" scans a token starting with '-':
**   "-=" gives MINUS_EQ, "--" gives DECR, "->" gives ARROW,
**   a lone "-" gives MINUS.
**
** returns: nothing
*/
void scanner::get_special_minus()
{
    switch( next_ch )
    {
    case '=': code = MINUS_EQ; break;
    case '-': code = DECR;     break;
    case '>': code = ARROW;    break;
    default:  code = MINUS;    return;  // one-character token
    }
    lexeme[i_lex++] = advance();        // take the second character
}
/*---------------------------------------------------------------------------*
** "get_special_times" scans a token starting with '*':
**   "*=" gives TIMES_EQ, a lone "*" gives TIMES.
**
** returns: nothing
*/
void scanner::get_special_times()
{
    if( next_ch != '=' )
    {
        code = TIMES;
        return;
    }
    lexeme[i_lex++] = advance();        // the '='
    code = TIMES_EQ;
}
/*---------------------------------------------------------------------------*
** "get_special_div" scans a token starting with '/':
**   "/=" gives DIV_EQ, a lone "/" gives DIV.
**
** returns: nothing
*/
void scanner::get_special_div()
{
    if( next_ch != '=' )
    {
        code = DIV;
        return;
    }
    lexeme[i_lex++] = advance();        // the '='
    code = DIV_EQ;
}
/*---------------------------------------------------------------------------*
** "get_special_mod" scans a token starting with '%':
**   "%=" gives MOD_EQ, a lone "%" gives MOD.
**
** returns: nothing
*/
void scanner::get_special_mod()
{
    if( next_ch != '=' )
    {
        code = MOD;
        return;
    }
    lexeme[i_lex++] = advance();        // the '='
    code = MOD_EQ;
}
/*---------------------------------------------------------------------------*
** "get_special_right" scans a token starting with '>':
**   ">=" gives GE, ">>" gives RT_SHIFT_BITS, ">>=" gives RT_SHIFT_EQ,
**   a lone ">" gives GT.
**
** returns: nothing
*/
void scanner::get_special_right()
{
    if( next_ch == '=' )
    {
        lexeme[i_lex++] = advance();
        code = GE;
        return;
    }
    if( next_ch != '>' )
    {
        code = GT;
        return;
    }
    lexeme[i_lex++] = advance();        // second '>'
    if( next_ch == '=' )
    {
        lexeme[i_lex++] = advance();
        code = RT_SHIFT_EQ;
    }
    else
        code = RT_SHIFT_BITS;
}
/*---------------------------------------------------------------------------*
** "get_special_left" scans a token starting with '<'.
** When the previous token was PP_INCLUDE the '<' opens a file name and the
** work is passed to get_filename.  Otherwise:
**   "<=" gives LE, "<<" gives LF_SHIFT_BITS, "<<=" gives LF_SHIFT_EQ,
**   a lone "<" gives LT.
**
** returns: nothing
*/
void scanner::get_special_left()
{
    if( previous == PP_INCLUDE )
    {
        get_filename();
        return;
    }
    if( next_ch == '=' )
    {
        lexeme[i_lex++] = advance();
        code = LE;
        return;
    }
    if( next_ch != '<' )
    {
        code = LT;
        return;
    }
    lexeme[i_lex++] = advance();        // second '<'
    if( next_ch == '=' )
    {
        lexeme[i_lex++] = advance();
        code = LF_SHIFT_EQ;
    }
    else
        code = LF_SHIFT_BITS;
}
/*---------------------------------------------------------------------------*
** "get_special_and" scans a token starting with '&':
**   "&=" gives AND_EQ, "&&" gives AND, a lone "&" gives AND_BITS.
**
** returns: nothing
*/
void scanner::get_special_and()
{
    switch( next_ch )
    {
    case '=': code = AND_EQ;   break;
    case '&': code = AND;      break;
    default:  code = AND_BITS; return;  // one-character token
    }
    lexeme[i_lex++] = advance();        // take the second character
}
/*---------------------------------------------------------------------------*
** "get_special_or" scans a token starting with '|':
**   "|=" gives OR_EQ, "||" gives OR, a lone "|" gives OR_BITS.
**
** returns: nothing
*/
void scanner::get_special_or()
{
    switch( next_ch )
    {
    case '=': code = OR_EQ;   break;
    case '|': code = OR;      break;
    default:  code = OR_BITS; return;   // one-character token
    }
    lexeme[i_lex++] = advance();        // take the second character
}
/*---------------------------------------------------------------------------*
** "get_special_xor" scans a token starting with '^':
**   "^=" gives XOR_EQ, a lone "^" gives XOR_BITS.
**
** returns: nothing
*/
void scanner::get_special_xor()
{
    if( next_ch != '=' )
    {
        code = XOR_BITS;
        return;
    }
    lexeme[i_lex++] = advance();        // the '='
    code = XOR_EQ;
}
/*---------------------------------------------------------------------------*
** "get_special_pound" scans a token starting with '#':
**   "##" gives DOUBLE_POUND; anything else is handed to get_preprocessor.
**
** returns: nothing
*/
void scanner::get_special_pound()
{
    if( next_ch != '#' )
    {
        get_preprocessor();
        return;
    }
    lexeme[i_lex++] = advance();        // second '#'
    code = DOUBLE_POUND;
}
/*---------------------------------------------------------------------------*
** "get_special_quote" scans a token starting with '"'.  Directly after a
** PP_INCLUDE token the quote opens a file name (get_filename); in every
** other position it opens a string (get_string).
**
** returns: nothing
*/
void scanner::get_special_quote()
{
    if( previous != PP_INCLUDE )
    {
        get_string();
        return;
    }
    get_filename();
}
/*
** Preprocessor command table: maps each directive, spelled with its
** leading '#', to its token code.  get_preprocessor() searches it through
** find(), a binary search using strcmp(), so the entries must stay in
** ascending strcmp() order.
*/
LEX_TABLE command_table[] =
{
    { "#define", PP_DEFINE },
    { "#elif", PP_ELIF },
    { "#else", PP_ELSE },
    { "#endif", PP_ENDIF },
    { "#error", PP_ERROR },
    { "#if", PP_IF },
    { "#ifdef", PP_IFDEF },
    { "#ifndef", PP_IFNDEF },
    { "#include", PP_INCLUDE },
    { "#pragma", PP_PRAGMA },
    { "#undef", PP_UNDEF },
};
/*---------------------------------------------------------------------------*
** "get_preprocessor" gets a preprocessor command.
** Assumes that the '#' is already in the token lexeme string.
**
** Blanks and tabs between the '#' and the command name are skipped, then
** the letters of the name are appended to lexeme and the result is looked
** up in command_table.  code receives the command's token code, or
** ID_TOKEN when the text is not in the table.
**
** Only letters are consumed.  A '#' that is followed by a newline or by
** the end of the source therefore leaves that character in place, so the
** line is still counted and the end of input is still seen by get_token.
**
** returns: nothing
*/
void scanner::get_preprocessor()
{
    const int high = sizeof( command_table ) / sizeof( LEX_TABLE ) - 1;
    on_line = line_count;
    while( next_ch == ' ' || next_ch == '\t' )  // after #
        advance();
    // cast: <ctype.h> functions are undefined for negative char values
    while( isalpha( static_cast<unsigned char>( next_ch ) ) )
        lexeme[i_lex++] = advance();
    lexeme[i_lex] = 0;
    code = find( command_table, lexeme, high );
}
/*---------------------------------------------------------------------------*
** "get_filename" gets an #include filename_ext from the source
**
** returns: nothing
*/
void scanner::get_filename()
{
char last_ch; // end of string char
if( curr_ch == '<' )
last_ch = '>';
else
last_ch = '\"';
do
lexeme[i_lex++] = advance();
while( curr_ch != last_ch );
code = ID_TOKEN;
on_line = line_count;
}
/*---------------------------------------------------------------------------*
** "get_string" gets a string from the source
**
** returns: nothing
*/
void scanner::get_string()
{
do
{
if( curr_ch == '\\' ) // get embedded "
lexeme[i_lex++] = advance();
lexeme[i_lex++] = advance();
}
while( curr_ch != '\"' );
code = STR_TOKEN;
on_line = line_count;
}
/*---------------------------------------------------------------------------*
* "get_character" gets a character token from the source
*
* returns: nothing
*/
void scanner::get_character()
{
lexeme[i_lex++] = advance(); // first char after '
if( curr_ch == '\\' ) // get embedded '
lexeme[i_lex++] = advance();
lexeme[i_lex++] = advance(); // last '
code = CHR_TOKEN;
on_line = line_count;
}
| home
| contents
| previous
| next page
| send comment
| send link
| add bookmark |
Copyright © 2004, Stephen R. Schmitt