// SCN_MAIN.CPP -- text scanner
/*-------------------------------------------------------------------*
Scanner module analyzes source file for T tokens
File: scn_main.cpp
Module: scanner
by: Stephen R. Schmitt
*-------------------------------------------------------------------*/
#include "tpl_data.h"
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
* Key word tables
*/
/*
 * One key word table entry: the spelling of a reserved word and the
 * token code reported for it.  The tables below end with an entry
 * whose "string" is NULL.
 */
typedef struct
{
    const char *string;         // spelling; initialised from a string literal
    TOKEN_CODE  token_code;     // token code assigned when the word matches
}
KW_STRUCT;
// One table per key word length.  is_key_word() picks the table that
// matches the length of the lexeme and scans it linearly until it
// reaches the {NULL, NO_TOKEN} entry that ends every table.

// key words of length 2
KW_STRUCT kw_2[] = {
{"do", DO}, {"if", IF}, {"ln", LN},
{"of", OF}, {"or", OR}, {NULL, NO_TOKEN}};
// key words of length 3
KW_STRUCT kw_3[] = {
{"abs", ABS}, {"and", AND}, {"chr", CHR }, {"cos", COS},
{"div", DIV}, {"end", END}, {"eof", EOFF}, {"exp", EXP},
{"for", FOR}, {"get", GET}, {"int", INT }, {"max", MAX},
{"min", MIN}, {"mod", MOD}, {"nor", NOR }, {"not", NOT},
{"ord", ORD}, {"put", PUT}, {"sin", SIN }, {"tan", TAN},
{"var", VAR}, {"xor", XOR},
{NULL, NO_TOKEN}};
// key words of length 4
KW_STRUCT kw_4[] = {
{"case", CASE}, {"ceil", CEIL}, {"char", CHAR},
{"cosh", COSH}, {"else", ELSE}, {"enum", ENUM},
{"exit", EXIT}, {"goto", GOTO}, {"loop", LOOP},
{"log2", LOG2}, {"nand", NAND}, {"open", OPEN},
{"pred", PRED}, {"rand", RAND}, {"real", REAL},
{"sign", SIGN}, {"sinh", SINH}, {"sqrt", SQRT},
{"succ", SUCC}, {"tanh", TANH}, {"then", THEN},
{"true", TRUE}, {"type", TYPE}, {"when", WHEN},
{NULL, NO_TOKEN}};
// key words of length 5
KW_STRUCT kw_5[] = {
{"array", ARRAY}, {"close", CLOSE}, {"const", CONST},
{"elsif", ELSIF}, {"false", FALSE}, {"floor", FLOOR},
{"index", INDEX}, {"label", LABEL}, {"log10", LOG10},
{"putch", PUTCH}, {"round", ROUND}, {"union", UNION},
{"value", VALUE}, {"watch", WATCH}, {NULL, NO_TOKEN}};
// key words of length 6
KW_STRUCT kw_6[] = {
{"arccos", ARCCOS}, {"arcsin", ARCSIN}, {"arctan", ARCTAN},
{"assert", ASSERT}, {"putstr", PUTSTR}, {"cursor", CURSOR},
{"getexp", GETEXP}, {"getkey", GETKEY}, {"intstr", INTSTR},
{"length", LENGTH}, {"locate", LOCATE}, {"scroll", SCROLL},
{"record", RECORD}, {"repeat", REPEAT}, {"return", RETURN},
{"setexp", SETEXP}, {"string", STRING}, {"strint", STRINT},
{NULL, NO_TOKEN}};
// key words of length 7
KW_STRUCT kw_7[] = {
{"boolean", BOOLEAN}, {"include", INCLUDE},
{"intreal", INTREAL}, {"program", PROGRAM},
{"realstr", REALSTR}, {"randint", RANDINT},
{"putline", PUTLINE}, {"strreal", STRREAL},
{NULL, NO_TOKEN}};
// key words of length 8
KW_STRUCT kw_8[] = {
{"arctanxy", ARCTANXY}, {"continue", CONTINUE},
{"erealstr", EREALSTR}, {"frealstr", FREALSTR},
{"function", FUNCTION}, {"randseed", RANDSEED},
{"putpixel", PUTPIXEL}, {"setvideo", SETVIDEO},
{NULL, NO_TOKEN}};
// key words of length 9
KW_STRUCT kw_9[] = {
{"procedure", PROCEDURE}, {"randomize", RANDOMIZE},
{"videomode", VIDEOMODE}, {"videotype", VIDEOTYPE},
{NULL, NO_TOKEN}};
// key words of length 10
KW_STRUCT kw_10[] = {
{"decreasing", DECREASING}, {NULL, NO_TOKEN}};
// Table lookup by word length: kw_table[n] holds the key words that are
// n characters long.  Slots 0 and 1 are NULL; is_key_word() only indexes
// lengths from MIN_KEY_WORD_LENGTH to MAX_KEY_WORD_LENGTH (presumably 2
// and 10 -- confirm in tpl_data.h).
KW_STRUCT *kw_table[] = {
NULL, NULL, kw_2, kw_3, kw_4, kw_5,
kw_6, kw_7, kw_8, kw_9, kw_10};
/*
 * Token lists used in parsing
 *
 * Every list ends with NO_TOKEN.  token_in() walks a list until it
 * reaches an entry that tests as zero, so NO_TOKEN has to be the
 * zero-valued token code.  The list names suggest where the parser
 * uses each one; the parser itself is not part of this file.
 */
// tokens accepted as the start of a statement (going by the name)
TOKEN_CODE Stmt_start_list[] = {
ID_TOKEN, IF, LOOP, CASE, FOR, ASSERT, RETURN, EXIT, CONTINUE,
GOTO, PUT, GET, WATCH, PUTCH, PUTSTR, PUTLINE, PUTPIXEL,
CURSOR, LOCATE, RANDOMIZE, RANDSEED, SCROLL, SETVIDEO, NO_TOKEN};
// tokens that end a statement sequence (going by the name)
TOKEN_CODE Stmt_end_list[] = {
END, ELSE, ELSIF, END_OF_FILE, LABEL, NO_TOKEN};
// tokens that start a declaration (going by the name)
TOKEN_CODE Decl_start_list[] = {
PROGRAM, PROCEDURE, FUNCTION, CONST, VAR, TYPE, LABEL, NO_TOKEN};
TOKEN_CODE Follow_dimension_list[] = { // for array declarations
COMMA, OF, END_OF_FILE, NO_TOKEN};
// Pass_one_list is Pass_two_list plus the data declarations and INCLUDE
TOKEN_CODE Pass_one_list[] = {
PROGRAM, PROCEDURE, FUNCTION, CONST, VAR, TYPE, INCLUDE, NO_TOKEN};
TOKEN_CODE Pass_two_list[] = {
PROGRAM, PROCEDURE, FUNCTION, NO_TOKEN};
TOKEN_CODE Pass_end_list[] = {
END_OF_FILE, NO_TOKEN};
TOKEN_CODE Data_decl_list[] = {
CONST, VAR, TYPE, LABEL, NO_TOKEN};
// same tokens as Stmt_start_list except that EXIT and CONTINUE are absent
TOKEN_CODE Rtne_stmt_list[] = {
ID_TOKEN, IF, LOOP, CASE, FOR, ASSERT, RETURN,
GOTO, GET, PUT, WATCH, PUTCH, PUTSTR, PUTLINE, PUTPIXEL,
CURSOR, LOCATE, RANDOMIZE, RANDSEED, SCROLL, SETVIDEO, NO_TOKEN};
// Rtne_end_list, Loop_end_list, For_end_list and If_end2_list all hold
// the same two tokens: END and END_OF_FILE
TOKEN_CODE Rtne_end_list[] = {
END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE Loop_end_list[] = {
END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE For_end_list[] = {
END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE If_end1_list[] = {
ELSIF, ELSE, END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE If_end2_list[] = {
END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE Case_label_list[] = {
ID_TOKEN, INT_TOKEN, CHAR_TOKEN, STR_TOKEN, PLUS, MINUS, NO_TOKEN};
TOKEN_CODE Case_end_list[] = {
VALUE, END, END_OF_FILE, NO_TOKEN};
TOKEN_CODE Put_item_list[] = {
ID_TOKEN, INT_TOKEN, REAL_TOKEN,
STR_TOKEN, CHAR_TOKEN, CHR, NO_TOKEN};
// the six relational operator tokens
TOKEN_CODE Rel_op_list[] = {
EQ, NE, LT, LE, GT, GE, NO_TOKEN};
// key word tokens of the predefined functions (going by the name)
TOKEN_CODE Function_list[] = {
ABS, ARCCOS, ARCSIN, ARCTAN, ARCTANXY,
CEIL, CHR, CLOSE, COS, COSH,
EOFF, EREALSTR, EXP,
FLOOR, FREALSTR,
GETEXP, GETKEY,
INDEX, INTREAL, INTSTR,
LENGTH, LN, LOG10, LOG2,
MAX, MIN, OPEN, ORD, PRED,
RAND, RANDINT, REALSTR, REPEAT, ROUND,
SETEXP, SIGN, SIN, SINH, SQRT, STRINT, STRREAL, SUCC,
TAN, TANH,
VIDEOMODE, VIDEOTYPE, NO_TOKEN};
// a subset of Function_list; presumably the functions with a numeric
// result -- confirm against the parser
TOKEN_CODE Number_type_list[] = {
ABS, ARCCOS, ARCSIN, ARCTAN, ARCTANXY,
CEIL, COS, COSH,
EXP,
FLOOR,
GETEXP, GETKEY,
INDEX, INTREAL,
LENGTH, LN, LOG10, LOG2,
MAX, MIN, OPEN, ORD,
RAND, RANDINT, ROUND,
SETEXP, SIGN, SIN, SINH, SQRT, STRINT, STRREAL,
TAN, TANH,
VIDEOMODE, VIDEOTYPE, NO_TOKEN};
// five entries from Function_list plus the string literal token;
// presumably the items with a string result -- confirm against the parser
TOKEN_CODE String_type_list[] = {
EREALSTR, FREALSTR, INTSTR, REALSTR, REPEAT,
STR_TOKEN, NO_TOKEN};
/*
 * Global variables
 *
 * The arrays dimensioned MAX_FILES hold one entry per include nesting
 * level and are indexed by "File_number".
 */
token_struct Token; // current token, copied from Next_token by get_token
token_struct Next_token; // lookahead token, filled by get_next_token
char Ch; // current input character, set by get_char
int Line_count; // total lines read, over all source files
FILE *Source[MAX_FILES]; // source file handle of each nesting level
char *Filename[MAX_FILES]; // source file name; allocated in init_scanner
char *Buffer[MAX_FILES]; // current source line; allocated in init_scanner
char *Buffer_ptr[MAX_FILES]; // next unread character of the current line
int Buffer_offset[MAX_FILES]; // column counter kept by get_char (tabs expanded)
int Token_offset; // Buffer_offset value at the start of the token
int Line_number[MAX_FILES]; // lines read so far from each source file
int File_number; // index of the file being scanned; 0 is the first file
/*-------------------------------------------------------------------*
Code Section
*-------------------------------------------------------------------*/
/*
 * "init_scanner" allocates the file name and line buffers for every
 * nesting level, opens the first source file and primes the scanner
 * by reading the first character and the first token.
 *
 * name:    path of the first source file
 * returns: true if the source file was opened, else false
 */
bool init_scanner(char *name)                   // of source file
{
    File_number = 0;                            // first file

    // allocate and clear the buffers of every nesting level
    for (int i = 0; i < MAX_FILES; i++)
    {
        Filename[i] = new char[MAX_STRING_LENGTH];
        memset(Filename[i], '\0', MAX_STRING_LENGTH);
        Buffer[i] = new char[MAX_STRING_LENGTH];
        memset(Buffer[i], '\0', MAX_STRING_LENGTH);
    }

    // open the first source file
    if (!open_source_file(name))
        return false;

    // initialize line numbers
    Line_count = 0;
    Line_number[File_number] = 0;

    // Start on an empty line so that the first get_char has to read a
    // source line.  The line buffer itself is used for this; a string
    // literal must not be assigned to a non-const "char *".
    Buffer[File_number][0] = '\0';
    Buffer_ptr[File_number] = Buffer[File_number];
    get_char();

    Token.code = NO_TOKEN;
    Next_token.code = NO_TOKEN;
    get_token();                                // first token
    return true;
}
/*
* "open_source_file" opens a source file.
*
* returns: nothing
*/
bool open_source_file(char *name) // of source file
{
bool result; // true if success
// open the source file
strcpy(Filename[File_number], name);
Source[File_number] = fopen(name, "r");
if (Source[File_number] == NULL)
result = false;
else
result = true;
return result;
}
/*
 * "quit_scanner" terminates the scanner: it closes the source files
 * that are still open and releases the buffers allocated by
 * "init_scanner".
 *
 * returns: nothing
 */
void quit_scanner()
{
    // Close every nesting level from the current file down to the first
    // one.  A handle is NULL when the open failed, and fclose must not
    // be given a null pointer.
    for (int level = File_number; level >= 0; level--)
    {
        if ((level < MAX_FILES) && (Source[level] != NULL))
        {
            fclose(Source[level]);
            Source[level] = NULL;
        }
    }

    // de-allocate memory for buffers; they were created with new[],
    // so they have to be released with delete[]
    for (int i = 0; i < MAX_FILES; i++)
    {
        assert(Filename[i] != NULL);
        delete[] Filename[i];
        Filename[i] = NULL;
        assert(Buffer[i] != NULL);
        delete[] Buffer[i];
        Buffer[i] = NULL;
    }
}
/*
 * "get_char" sets global variable "Ch" to the next character
 * from the source buffer, reading another source line when the
 * buffer is used up.  Tabs and newlines are returned as a space; a
 * '%' discards the rest of its line and is returned as a space.
 * "Ch" is set to EOF_CHAR when no more lines can be read.
 *
 * returns: nothing
 */
void get_char()
{
    if (*Buffer_ptr[File_number] == '\0')       // at end of buffer?
    {
        if (!get_source_line())                 // at end of file?
        {
            Ch = EOF_CHAR;
            return;
        }
        Buffer_ptr[File_number] = Buffer[File_number];
        Buffer_offset[File_number] = 0;
    }

    Ch = *Buffer_ptr[File_number];              // get next character
    Buffer_ptr[File_number]++;

    // convert newlines, tabs and comments to spaces
    switch (Ch)
    {
    case '\t':                                  // tab: advance to next tab stop
        Buffer_offset[File_number]
            += TAB_SIZE - Buffer_offset[File_number] % TAB_SIZE;
        Ch = ' ';
        break;
    case '\n':                                  // newline
        Buffer_offset[File_number]++;
        Ch = ' ';
        break;
    case '%':                                   // comment line
        if (get_source_line())
        {
            Buffer_ptr[File_number] = Buffer[File_number];
            Buffer_offset[File_number] = 0;
            Ch = ' ';
        }
        else
        {
            // Skip to the end of the comment so that a later call
            // reports end of file again instead of returning the
            // text of the comment.
            Buffer_ptr[File_number] += strlen(Buffer_ptr[File_number]);
            Ch = EOF_CHAR;
        }
        break;
    default:                                    // all others
        Buffer_offset[File_number]++;
        break;
    }
}
/*
 * "get_token" makes the lookahead token the current token and scans
 * a new lookahead token.  An INCLUDE lookahead switches to the named
 * file, and an END_OF_FILE lookahead inside an included file switches
 * back to the including file; neither becomes the current token.
 *
 * NOTE(review): include_file reports a failure by setting
 * Next_token.code to ERR_TOKEN, but the get_next_token call in the
 * loop overwrites that value -- confirm this is intended.
 *
 * returns: nothing
 */
void get_token()
{
    for (;;)
    {
        const bool at_include = (Next_token.code == INCLUDE);
        const bool nested_end = (Next_token.code == END_OF_FILE)
                             && (File_number > 0);
        if (!at_include && !nested_end)
            break;                              // an ordinary token

        if (at_include)
        {
            emit_statement_marker(Next_token.file, Next_token.line);
            include_file();
        }

        // tested again because include_file may have changed the
        // lookahead token
        if ((Next_token.code == END_OF_FILE) && (File_number > 0))
            delete_file();

        get_next_token();
    }

    Token = Next_token;
    if (Token.code != END_OF_FILE)              // nothing follows end of file
        get_next_token();
}
/*
 * "get_next_token" gets the next token from the source file and
 * stores it in "Next_token" together with the file name, line and
 * column where it starts.  Global variable "Ch" is set to the first
 * character after the token in the source file.
 *
 * returns: nothing
 */
void get_next_token()
{
    while (Ch == ' ')                           // skip over white space
        get_char();
    Token_offset = Buffer_offset[File_number];

    // The <ctype.h> functions need a value in the range of unsigned
    // char; a plain char can be negative.
    const unsigned char first = (unsigned char)Ch;

    if (isalpha(first))
        get_word();
    else if (isdigit(first))
        get_number();
    else if (Ch == '\"')
        get_string();
    else if (Ch == '\'')
        get_character();
    else if (Ch == EOF_CHAR)
    {
        Next_token.code = END_OF_FILE;          // set token
        Next_token.lexeme[0] = '\0';            // empty lexeme
    }
    else
        get_special();

    // record where the token was found
    strcpy(Next_token.file, Filename[File_number]);
    Next_token.line = Line_number[File_number];
    Next_token.column = Token_offset;
}
/*
* "get_word" extracts a word token. Digits and the character '_'
* are permitted within a word but may not be the first character.
*
* returns: nothing
*/
void get_word()
{
int i = 0;
while (isalnum(Ch)||(Ch == '_'))
{
Next_token.lexeme[i] = Ch; // add to token_string
i++;
get_char(); // get next character
}
Next_token.lexeme[i] = NULL; // null terminate
if (!is_key_word())
Next_token.code = ID_TOKEN;
}
/*
* "is_key_word" checks to see if a word is in key word table.
*
* returns: true if it is, false if not
*/
bool is_key_word()
{
int word_length = strlen(Next_token.lexeme);
KW_STRUCT *kwp;
if ((word_length >= MIN_KEY_WORD_LENGTH)&&
(word_length <= MAX_KEY_WORD_LENGTH))
{
for (kwp = kw_table[word_length]; kwp->string != NULL; kwp++)
{
if (strcmp(Next_token.lexeme, kwp->string) == 0)
{
// matches a token
Next_token.code = kwp->token_code;
return true;
}
}
}
return false;
}
/*
* "get_number" extracts a number token
*
* returns: nothing
*/
void get_number()
{
char buffer[TOKEN_STRING_LENGTH]; // number token buffer
Next_token.code = INT_TOKEN; // defaults
// extract the whole part of the number
int i = 0;
while (isdigit(Ch))
{
buffer[i] = Ch;
get_char();
i++;
}
buffer[i] = NULL;
// look for decimal or exponent symbol
if (Ch == '.')
{
get_char();
if (Ch == '.')
{
Buffer_ptr[File_number]--;
buffer[i] = NULL;
goto get_number_done;
}
else
{
buffer[i] = '.';
i++;
}
Next_token.code = REAL_TOKEN;
while (isdigit(Ch))
{
buffer[i] = Ch;
get_char();
i++;
}
buffer[i] = NULL;
}
if ((Ch == 'E') || (Ch == 'e'))
{
Next_token.code = REAL_TOKEN;
buffer[i] = Ch ;
get_char();
i++;
if ((Ch == '+') || (Ch == '-'))
{
buffer[i] = Ch ;
get_char();
i++;
}
if (!isdigit(Ch))
{
compile_error(M_INVALID, M_NUMBER, M_0);
Next_token.code = ERR_TOKEN;
}
while (isdigit(Ch))
{
buffer[i] = Ch ;
get_char();
i++;
}
buffer[i] = NULL;
}
get_number_done:
if (Next_token.code == INT_TOKEN)
Next_token.integer = atol(buffer);
else
Next_token.real = atof(buffer);
if (errno == ERANGE)
compile_error(M_REAL, M_OUT, M_OF, M_RANGE, M_0);
}
/*
* "get_string" extracts a string token from the source buffer.
*
* returns: nothing
*/
void get_string()
{
int i = 0; // index for literal
Ch = *Buffer_ptr[File_number]; // first char in string
Buffer_ptr[File_number]++;
while ((Ch != EOF_CHAR)&&(Ch != NULL)) // extract the string
{
if (Ch == '\"') // ending quote?
{
get_char();
break;
}
if (Ch == '\\') // backslash character?
{
Ch = *Buffer_ptr[File_number];
Buffer_ptr[File_number]++;
i = get_backslash(i);
}
Next_token.lexeme[i] = Ch; // add to literal
i++;
Ch = *Buffer_ptr[File_number]; // next char in string
Buffer_ptr[File_number]++;
}
Next_token.lexeme[i] = NULL;
Next_token.code = STR_TOKEN;
}
/*
 * "get_character" extracts a character token from the source buffer.
 * On entry "Ch" is the opening quote.  The character is taken
 * directly from the buffer, not through get_char; a backslash
 * introduces an escape that is translated by get_backslash.  A
 * syntax error is reported if the closing quote is missing.
 *
 * The buffer pointer is never moved past the terminator of the line,
 * so the get_char call at the end continues with the next source
 * line when the literal is cut off by the end of the line.
 *
 * returns: nothing
 */
void get_character()
{
    int i = 0;

    Ch = *Buffer_ptr[File_number];              // the character
    if (Ch != '\0')
        Buffer_ptr[File_number]++;
    if (Ch == '\\')                             // backslash character?
    {
        Ch = *Buffer_ptr[File_number];
        if (Ch != '\0')
            Buffer_ptr[File_number]++;
        i = get_backslash(i);
    }
    Next_token.lexeme[i] = Ch;                  // add to literal
    i++;

    Ch = *Buffer_ptr[File_number];              // should be the closing quote
    if (Ch != '\0')
        Buffer_ptr[File_number]++;
    if (Ch != '\'')
        compile_error(M_SYNTAX, M_ERROR, M_0);
    get_char();

    Next_token.lexeme[i] = '\0';
    Next_token.code = CHAR_TOKEN;
}
/*
 * "get_backslash" translates the character after a backslash.
 * On entry "Ch" is that character; on return "Ch" is the character
 * the caller has to store in the literal.  For 'n' a CR_CHAR is
 * stored in the lexeme at index i and "Ch" becomes LF_CHAR.  An
 * unknown escape is reported as a syntax error and gives a null
 * character.
 *
 * The caller has to make sure that the lexeme has room for the extra
 * character at index i.
 *
 * i:       index of the next free place in the lexeme
 * returns: the index where the caller has to store "Ch"
 */
int get_backslash(int i)
{
    switch (Ch)
    {
    case '\"':                                  // embedded quote?
    case '\'':
    case '\\':                                  // embedded back slash?
        break;                                  // the character stands for itself
    case 'n':                                   // new line (cr/lf)?
    case 'N':
        Next_token.lexeme[i] = CR_CHAR;
        i++;
        Ch = LF_CHAR;
        break;
    case 't':                                   // tab?
    case 'T':
        Ch = TAB_CHAR;
        break;
    case 'f':                                   // form feed?
    case 'F':
        Ch = FF_CHAR;
        break;
    case 'b':                                   // back space?
    case 'B':
        Ch = BS_CHAR;
        break;
    case '0':                                   // null?
        Ch = '\0';
        break;
    default:
        compile_error(M_SYNTAX, M_ERROR, M_0);
        Ch = '\0';                              // a null character is stored
        break;
    }
    return i;
}
/*
 * "take_equals" completes a two character operator: if "Ch" is '='
 * it is added to the lexeme of "Next_token" at index "length" and
 * the character after it is read.
 *
 * returns: true if an '=' was taken, else false
 */
static bool take_equals(int &length)
{
    if (Ch != '=')
        return false;
    Next_token.lexeme[length] = Ch;
    length++;
    get_char();
    return true;
}
/*
 * "get_special" extracts the tokens corresponding to punctuation
 * characters.  On entry "Ch" is the first character of the token.
 * A character or combination that is not a token gives ERR_TOKEN.
 *
 * returns: nothing
 */
void get_special()
{
    const char first = Ch;                      // selects the token
    int length = 1;                             // characters in lexeme

    Next_token.lexeme[0] = first;               // add to token string
    get_char();                                 // every token uses up "first"

    switch (first)
    {
    case ',': Next_token.code = COMMA;     break;
    case '+': Next_token.code = PLUS;      break;
    case '-': Next_token.code = MINUS;     break;
    case '*': Next_token.code = STAR;      break;
    case '/': Next_token.code = SLASH;     break;
    case '^': Next_token.code = CARET;     break;
    case '&': Next_token.code = AMPERSAND; break;
    case '=': Next_token.code = EQ;        break;
    case '(': Next_token.code = L_PAREN;   break;
    case ')': Next_token.code = R_PAREN;   break;

    case '.':                                   // period or three dots?
        if (Ch != '.')
        {
            Next_token.code = PERIOD;
            break;
        }
        Next_token.lexeme[length] = Ch;
        length++;
        get_char();
        if (Ch == '.')
        {
            Next_token.lexeme[length] = Ch;
            length++;
            Next_token.code = ELLIPSES;
            get_char();
        }
        else
            Next_token.code = ERR_TOKEN;        // two dots are not a token
        break;

    case ':':                                   // colon or assign?
        Next_token.code = take_equals(length) ? ASSIGN : COLON;
        break;
    case '<':                                   // lt or le?
        Next_token.code = take_equals(length) ? LE : LT;
        break;
    case '>':                                   // gt or ge?
        Next_token.code = take_equals(length) ? GE : GT;
        break;
    case '~':                                   // ne or error?
        Next_token.code = take_equals(length) ? NE : ERR_TOKEN;
        break;

    default:
        Next_token.code = ERR_TOKEN;
        break;
    }
    Next_token.lexeme[length] = '\0';           // terminate
}
/*
 * "token_in" tests the current token for membership in a
 * token list.  The list ends at the first entry that is zero.
 *
 * returns: true if "Token.code" is in the list; false if it is
 *          not or if there is no list
 */
bool token_in(TOKEN_CODE token_list[])          // a list of tokens
{
    if (token_list != NULL)
    {
        for (int k = 0; token_list[k]; k++)
        {
            if (token_list[k] == Token.code)    // found match
                return true;
        }
    }
    return false;                               // no list or no match
}
/*
 * "synchronize" skips tokens until the current token is in one
 * of the three lists or is END_OF_FILE.  A list may be NULL.
 *
 * returns: nothing
 */
void synchronize(TOKEN_CODE list1[],            // 1st list of tokens
                 TOKEN_CODE list2[],            // 2nd list of tokens
                 TOKEN_CODE list3[])            // 3rd list of tokens
{
    for (;;)
    {
        if (token_in(list1) || token_in(list2) || token_in(list3))
            return;                             // found a token to resume at
        if (Token.code == END_OF_FILE)
            return;                             // nothing left to skip
        get_token();                            // next in file
    }
}
/*
 * "get_source_line" reads the next line of the current source file
 * into its line buffer and counts it in "Line_count" and in the line
 * number of the file.
 *
 * returns: true if a line was read,
 *          false if fgets could not deliver one
 */
bool get_source_line()
{
    if (fgets(Buffer[File_number],
              MAX_STRING_LENGTH,
              Source[File_number]) == NULL)
        return false;

    Line_count++;
    Line_number[File_number]++;
    return true;
}
/*
* "include_file" opens a nested include file and starts processing
*
* returns: nothing
*/
void include_file()
{
char buffer[TOKEN_STRING_LENGTH]; // buffer for file name
while (Ch == ' ') // skip white space
get_char();
bool ext = false; // flag for extension
if (isalpha(Ch))
{
int length = 0; // length of file name
while (Ch != ' ')
{
buffer[length] = Ch;
if (Ch == '.')
ext = true;
get_char();
length++;
}
buffer[length] = NULL;
if (ext == false)
{
strcat(buffer, ".t");
}
File_number++;
if (open_source_file(buffer))
{
// initialize line number and prime get_char
Line_number[File_number] = 0;
Buffer_ptr[File_number] = Buffer[File_number];
Buffer_ptr[File_number] = "";
get_char();
}
else
{
File_number--;
Next_token.code = ERR_TOKEN;
}
}
else
Next_token.code = ERR_TOKEN;
}
/*
 * "delete_file" leaves an included file: it closes the current
 * source file, makes the including file the current one and sets
 * "Ch" to a space.  Nothing is done for the first file.
 *
 * returns: nothing
 */
void delete_file()
{
    if (File_number <= 0)                       // the first file stays open
        return;

    fclose(Source[File_number]);
    File_number--;                              // to previous file
    Ch = ' ';                                   // restore Ch
}
/*
 * "if_code_get_token_else_error" gets the next token if the
 * argument is the current token;
 * otherwise a compiler error "missing ..." is generated that
 * names the expected token.
 *
 * Only the token codes listed in the switch are supported; any
 * other code fails the assertion.
 *
 * returns: nothing
 */
void if_code_get_token_else_error(TOKEN_CODE code)  // of current token
{
    // Start with a defined value so that "msg" is never read
    // uninitialised when the assertion is compiled out.  M_0 is the
    // code that ends every compile_error call in this file.
    MSG_CODE msg = M_0;

    switch (code)
    {
    case ASSIGN:   msg = M_ASSIGN;   break;
    case COLON:    msg = M_COLON;    break;
    case COMMA:    msg = M_COMMA;    break;
    case ELLIPSES: msg = M_ELLIPSES; break;
    case L_PAREN:  msg = M_LPAREN;   break;
    case R_PAREN:  msg = M_RPAREN;   break;
    case DO:       msg = M_DO;       break;
    case END:      msg = M_END;      break;
    case OF:       msg = M_OF;       break;
    case RECORD:   msg = M_RECORD;   break;
    case THEN:     msg = M_THEN;     break;
    case UNION:    msg = M_UNION;    break;
    default:       assert(0);        break;     // unsupported token code
    }

    if (Token.code == code)
        get_token();
    else
        compile_error(M_MISSING, msg, M_0);
}
// Copyright © 2004, Stephen R. Schmitt