用C语言写解释器（四）——语句分析

在前面的章节中已经成功实现了内存管理和表达式求值模块。之所以称表达式求值是解释器的核心部分，是因为几乎所有语句的操作都伴随着表达式求值。也许你已经迫不及待地给 eval 传值让它执行复杂的运输了，但目前来讲它充其量只是一个计算器。要想成为一门语言，还需要一套自成体系的语法，包括输入输出语句和控制语句。但在进行语法分析之前，首先需要将 basic 源码载入到内存中。

// in basic_io.h

#define program_size (10000)

typedef struct {

int ln; // line number

string line;

} code;

extern code code[program_size];

extern int cp;

extern int code_size;

其中 code_size 的作用顾名思义：记录代码的行数。cp （0 ≤ cp < code_size）记录当前行的下标（比如 cp 等于5时表明执行到第5行）。下面是载入 basic 源码的参考代码，在载入源码的同时会去除两端的空白字符。

// in basic_io.c

void load_program ( string filename )

{

file *fp = fopen ( filename, "r" );

int bg, ed;

if ( fp == null ) {

fprintf ( stderr, "文件 %s 无法打开！/n", filename );

exit ( exit_failure );

}

while ( fscanf ( fp, "%d", &code[cp].ln ) != eof ) {

if ( code[cp].ln <= code[cp-1].ln ) {

fprintf ( stderr, "line %d: 标号错误！/n", cp );

fgets ( code[cp].line, sizeof(code[cp].line), fp );

for ( bg = 0; isspace(code[cp].line[bg]); bg++ );

ed = (int)strlen ( code[cp].line + bg ) - 1;

while ( ed >= 0 && isspace ( code[cp].line[ed+bg] ) ) {

ed--;

if ( ed >= 0 ) {

memmove ( code[cp].line, code[cp].line + bg, ed + 1 );

code[cp].line[ed + 1] = 0;

} else {

code[cp].line[0] = 0;

cp++;

if ( cp >= program_size ) {

fprintf ( stderr, "程序%s太大，代码空间不足！/n", filename );

code_size = cp;

cp = 1;

源码载入完成后就要开始逐行分析语句了，程序中总共能处理以下 11 种语句：

// in main.c

typedef enum {

key_input = 0, // input

key_print, // print

key_for, // for .. to .. step

key_next, // next

key_while, // while

key_wend, // wend

key_if, // if

key_else, // else

key_endif, // end if

key_goto, // goto

key_let // let

} keywords;

keywords yacc ( const string line )

if ( !strnicmp ( line, "input ", 6 ) ) {

return key_input;

} else if ( !strnicmp ( line, "print ", 6 ) ) {

return key_print;

} else if ( !strnicmp ( line, "for ", 4 ) ) {

return key_for;

} else if ( !stricmp ( line, "next" ) ) {

return key_next;

} else if ( !strnicmp ( line, "while ", 6 ) ) {

return key_while;

} else if ( !stricmp ( line, "wend" ) ) {

return key_wend;

} else if ( !strnicmp ( line, "if ", 3 ) ) {

return key_if;

} else if ( !stricmp ( line, "else" ) ) {

return key_else;

} else if ( !stricmp ( line, "end if" ) ) {

return key_endif;

} else if ( !strnicmp ( line, "goto ", 5 ) ) {

return key_goto;

} else if ( !strnicmp ( line, "let ", 4 ) ) {

return key_let;

} else if ( strchr ( line, '=' ) ) {

return -1;

每个语句对应有一个执行函数，在分析出是哪种语句后，就可以调用它了！为了编码方便，我们将这些执行函数保存在一个函数指针数组中，请看下面的参考代码：

void (*key_func[])( const string ) = {

exec_input,

exec_print,

exec_for,

exec_next,

exec_while,

exec_wend,

exec_if,

exec_else,

exec_endif,

exec_goto,

exec_assignment

};

int main ( int argc, char *argv[] )

if ( argc != 2 ) {

fprintf ( stderr, "usage: %s basic_script_file/n", argv[0] );

load_program ( argv[1] );

while ( cp < code_size ) {

(*key_func[yacc ( code[cp].line )]) ( code[cp].line );

return exit_success;

以上代码展示的就是整个程序的基础框架，现在欠缺的只是每个语句的执行函数，下面将逐个详细解释。

输入输出是一个宽泛的概念，并不局限于从键盘输入和显示到屏幕上，还包括操作文件、连接网络、进程通信等。《我们的目标》中指出只需实现从键盘输入（input）和显示到屏幕上（print），事实上还应该包括赋值语句，只不过它属于程序内部的i/o。

input 语句后面跟着一堆变量名（用逗号隔开）。因为变量是弱类型，你可以输入数字或字符串。但c语言是强类型语言，为实现这个功能就需要判断一下 scanf 的返回值。我们执行 scanf ( "%lf", &memory[n].i )，如果你输入的是一个数字，就能成功读取一个浮点数，函数返回 1、否则就返回 0；不能读取时就采用 getchar 来获取字符串！参考代码如下：

void exec_input ( const string line )

const char *s = line;

int n;

assert ( s != null );

s += 5;

while ( *s ) {

while ( *s && isspace(*s) ) {

s++;

if ( !isalpha(*s) || isalnum(*(s+1)) ) {

perror ( "变量名错误！/n" );

n = toupper(*s) - 'a';

if ( !scanf ( "%lf", &memory[n].i ) ) {

int i;

// 用户输入的是一个字符串

memory[n].type = var_string;

if ( (memory[n].s[0] = getchar()) == '"' ) {

for ( i = 0; (memory[n].s[i]=getchar())!='"'; i++ );

for ( i = 1; !isspace(memory[n].s[i]=getchar()); i++ );

memory[n].s[i] = 0;

memory[n].type = var_double;

do {

} while ( *s && isspace(*s) );

if ( *s && *s != ',' ) {

perror ( "input 表达式语法错误！/n" );

} else if ( *s ) {

输出相对简单些，print 后面跟随的是一堆表达式，表达式只需委托给 eval 来求值即可，因此 print 要做的仅仅是按照值的类型来输出结果。唯一需要小心的就是类似 print "hello, world" 这样字符串中带有逗号的情况，以下是参考代码：

void exec_print ( const string line )

string l;

char *s, *e;

variant v;

int c = 0;

strcpy ( l, line );

s = l;

for (;;) {

for ( e = s; *e && *e != ','; e++ ) {

// 去除字符串

if ( *e == '"' ) {

e++;

} while ( *e && *e != '"' );

if ( *e ) {

*e = 0;

e = null;

if ( c++ ) putchar ( '/t' );

v = eval ( s );

if ( v.type == var_double ) {

printf ( "%g", v.i );

} else if ( v.type == var_string ) {

printf ( v.s );

if ( e ) {

s = e + 1;

putchar ( '/n' );

break;

在 basic 中，“赋值”和“等号”都使用“=”，因此不能像 c 语言中使用 a = b = c 这样连续赋值，在 basic 中它的意思是判断 b 和 c 的值是否相等并将结果赋值给 a 。而且关键字 let 是可选的，即 let a = 1 和 a = 1 是等价的。剩下的事情那个就很简单了，只要将表达式的值赋给变量即可。以下是参考代码：

void exec_assignment ( const string line )

if ( !strnicmp ( s, "let ", 4 ) ) {

s += 4;

if ( *s != '=' ) {

fprintf ( stderr, "赋值表达式 %s 语法错误！/n", line );

memory[n] = eval ( s + 1 );

现在是最后一个模块——控制语句。控制语句并不参与交互，它们的作用只是根据一定的规则来改变代码指针（cp）的值，让程序能到指定的位置去继续执行。限于篇幅，本节只介绍 for、next 以及 goto 三个控制语句的实现方法，读者可以尝试自己完成其他函数，也可以参看附带的完整代码。

先来看一下 for 语句的结构：

for var = expression1 to expression2 [step expression3]

它首先要计算三个表达式，获得 v1、v2、v3 三个值，然后让变量（var）从 v1 开始，每次迭代都加 v3，直到超出 v2 的范围位置。因此，每一个 for 语句，我们都需要保存这四个信息：变量名、起始值、结束值以及步长。另外，不要忘记 for 循环等控制语句可以嵌套使用，因此需要开辟一组空间来保存这些信息，参考代码如下：

// in grammar.h

static struct {

int id; // memory index

int ln; // line number

double target; // target value

double step;

} stack_for[memory_size];

static int top_for = -1;

分析的过程就是通过 strstr 在语句中搜索“=”、“to”、“step”等字符串，然后将提取的表达式传递给 eval 计算，并将值保存到 stack_for 这个空间中。参考代码如下：

// in grammar.c

void exec_for ( const string line )

char *s, *t;

int top = top_for + 1;

if ( strnicmp ( line, "for ", 4 ) ) {

goto errorhandler;

} else if ( top >= memory_size ) {

fprintf ( stderr, "for 循环嵌套过深！/n" );

s = l + 4;

while ( *s && isspace(*s) ) s++;

if ( isalpha(*s) && !isalnum(s[1]) ) {

stack_for[top].id = toupper(*s) - 'a';

stack_for[top].ln = cp;

if ( *s == '=' ) {

t = strstr ( s, " to " );

if ( t != null ) {

*t = 0;

memory[stack_for[top].id] = eval ( s );

s = t + 4;

t = strstr ( s, " step " );

stack_for[top].target = eval ( s ).i;

s = t + 5;

stack_for[top].step = eval ( s ).i;

if ( fabs ( stack_for[top].step ) < 1e-6 ) {

stack_for[top].step = 1;

if ( (stack_for[top].step > 0 &&

memory[stack_for[top].id].i > stack_for[top].target)||

(stack_for[top].step < 0 &&

memory[stack_for[top].id].i < stack_for[top].target)) {

while ( cp < code_size && strcmp(code[cp].line, "next") ) {

if ( cp >= code_size ) {

top_for++;

return;

errorhandler:

fprintf ( stderr, "line %d: 语法错误！/n", code[cp].ln );

next 的工作就简单得多了。它从 stack_for 这个空间中取出最后一组数据，让变量的值累加上步长，并判断循环是否结束。如果结束就跳出循环执行下一条语句；否则就将代码指针移回循环体的顶部，继续执行循环体。下面是参考代码。

void exec_next ( const string line )

if ( stricmp ( line, "next" ) ) {

if ( top_for < 0 ) {

fprintf ( stderr, "line %d: next 没有相匹配的 for！/n", code[cp].ln );

memory[stack_for[top_for].id].i += stack_for[top_for].step;

if ( stack_for[top_for].step > 0 &&

memory[stack_for[top_for].id].i > stack_for[top_for].target ) {

top_for--;

} else if ( stack_for[top_for].step < 0 &&

memory[stack_for[top_for].id].i < stack_for[top_for].target ) {

cp = stack_for[top_for].ln;

也许你认为 goto 语句只是简单的将 cp 的值设置为指定的行，但事实上它比想象中的要复杂些。考虑下面的 basic 代码：

像这类代码，直接跳到循环体内部，如果只是简单地将 cp 移动到指定位置，当代码继续执行到 next 时就会报告没有对应的 for 循环！跳到其他的控制结构，如 while、if 等，也会出现相同的问题。以下是参考代码（有删减）。

void exec_goto ( const string line )

int ln;

if ( strnicmp ( line, "goto ", 5 ) ) {

ln = (int)eval ( line + 5 ).i;

if ( ln > code[cp].ln ) {

// 往下跳转

while ( cp < code_size && ln != code[cp].ln ) {

if ( !strnicmp ( code[cp].line, "if ", 3 ) ) {

top_if++;

stack_if[top_if] = 1;

} else if ( !stricmp ( code[cp].line, "else" ) ) {

} else if ( !stricmp ( code[cp].line, "end if" ) ) {

top_if--;

} else if ( !strnicmp ( code[cp].line, "while ", 6 ) ) {

top_while++;

stack_while[top_while].isrun = 1;

stack_while[top_while].ln = cp;

} else if ( !stricmp ( code[cp].line, "wend" ) ) {

top_while--;

} else if ( !strnicmp ( code[cp].line, "for ", 4 ) ) {

int i = 4;

while ( isspace(code[cp].line[i]) ) i++;

v = memory[toupper(code[cp].line[i])-'a'];

exec_for ( code[cp].line );

memory[toupper(code[cp].line[i])-'a'] = v;

} else if ( !stricmp ( code[cp].line, "next" ) ) {

} else if ( ln < code[cp].ln ) {

// 往上跳转

// 代码类似，此处省略

// 我不希望出现死循环，你可能有其他处理方式

fprintf ( stderr, "line %d: 死循环！/n", code[cp].ln );

if ( ln == code[cp].ln ) {

cp--;

fprintf ( stderr, "标号 %d 不存在！/n", ln );

本章介绍了源码载入、语法分析以及部分语句的实现，while 和 if 等控制语句方法和 for、next 类似，有兴趣的读者请尝试自己实现（或者参看附带的完整源码）。这样一个解释器的四个关键部分“内存管理”、“表达式求值”、“输入输出”和“控制语句”就全部介绍完了，希望你也能写出自己的解释器。下一篇我将总结一下我个人对编程语言的一些思考，如果你也有兴趣请继续关注《用c语言写解释器（五）》！

用C语言写解释器（四）——语句分析

继续阅读

C语言-----初阶指针详解

输出蛇形矩阵 C语言输出蛇形矩阵

1.54寸TFT ST7789液晶屏图片如何取模

三子棋——年轻人的第一款小游戏前言正文总结

DOG（4）：解析器的部分实现细节先来说说parser一些可能迷惑的地方结果如何返回?pcd其实是一回事最后的一点说明

NYOJ 269--VF

二叉树三种遍历(先序，中序，后序)----超详细引入先序遍历后序遍历中序遍历总结

UVA 110 Meta-Loopless Sorts

CRC32和CRC8校验代码，C语言版

241 Different Ways to Add Parentheses（C代码版）

C语言：初学者必定看懂的注释！！！猴子吃桃问题。猴子第一天摘下若干个桃子，每天都吃了前一天剩下的一半零一个，到第10天早上想再吃的时候，就剩下一个桃子. 求第一天共摘多少个桃子。

[转]九大排序算法——C语言实现及详解

while 循环、do- while 循环和 for 循环之间的那点事C语言自学之三种循环比较

结构体：typedef与struct的区别

hdu7108哈希