# 使用python编写一个最基础的代码解释器的要点解析

veca = [1, 2, 3] # 列表声明
vecb = [4, 5, 6]
print 'veca:', veca # 打印字符串、列表，print expr+
print 'veca * 2:', veca * 2 # 列表与整数乘法
print 'veca + 2:', veca + 2 # 列表与整数加法
print 'veca + vecb:', veca + vecb # 列表加法
print 'veca + [11, 12]:', veca + [11, 12]
print 'veca * vecb:', veca * vecb # 列表乘法
print 'veca:', veca
print 'vecb:', vecb

veca: [1, 2, 3]
veca * 2: [2, 4, 6]
veca + 2: [1, 2, 3, 2]
veca + vecb: [1, 2, 3, 2, 4, 5, 6]
veca + [11, 12]: [1, 2, 3, 2, 11, 12]
veca * vecb: [4, 5, 6, 8, 10, 12, 12, 15, 18, 8, 10, 12]
veca: [1, 2, 3, 2]
vecb: [4, 5, 6]

i love you.

LL(1)中两个L都是Left的意思：第一个L表示解析器按从左到右（Left-to-right）的顺序扫描输入；第二个L表示最左推导（Leftmost derivation），即在自顶向下下降的过程中总是优先展开最左边的非终结符，体现为按从左到右的顺序遍历子节点；而1表示只根据1个向前看单元（lookahead token）做预测。

statlist: stat+
stat: id '=' expr
| 'print' expr (',' expr)*
expr: multipart ('+' multipart)*
| str
multipart: primary ('*' primary)*
primary: int
| id
| '[' expr (',' expr)* ']'
int: (1..9)(0..9)*
id: (a..z | A..Z)+
str: (".*") | ('.*')

””’
a simple lexer of a small vector language.
statlist: stat+
stat: id ‘=’ expr
| ‘print’ expr (, expr)*
expr: multipart (‘+’ multipart)*
| str
multipart: primary (‘*’ primary)*
primary: int
| id
| ‘[‘ expr (‘,’, expr)* ‘]’
int: (1..9)(0..9)*
id: (a..z | a..z)*
str: (\”.*\”) | (\’.*\’)
created on 2012-9-26
@author: bjzllou
”’
# Sentinel returned when the input is exhausted.  An int can never clash
# with a real one-character token, which is always a str.
EOF = -1

# Token type tags.  They are UPPER_CASE on purpose: lower-case names such
# as ``int`` / ``str`` / ``print`` would shadow the builtins that the code
# below relies on (e.g. ``int(n)`` in ``_int`` and ``print(...)`` in the
# demo).
COMMA = 'comma'
EQUAL = 'equal'
LBRACK = 'lbrack'
RBRACK = 'rbrack'
PLUS = 'plus'
TIMES = 'times'
PRINT = 'print'
ID = 'id'
INT = 'int'
STR = 'str'


class veclexer:
    '''
    LL(1) lexer.

    It uses only one lookahead char to determine which is the next token.
    For each non-terminal token it has a rule (a ``_xxx`` method) to handle
    it.  LL(1) is a quite weak parser: it is not appropriate for grammars
    that are left-recursive or ambiguous (for example, the rule ``t: t r``
    is left-recursive).  However, it is rather simple, has high
    performance, and fits a simple grammar like this one.
    '''

    def __init__(self, input):
        self.input = input
        # Invariant: ``cur_c`` is the current character and ``idx`` is the
        # index of the NEXT character to read (i.e. cur_c == input[idx-1]).
        self.idx = 1
        # Guard against empty input instead of crashing on input[0].
        self.cur_c = input[0] if input else EOF

    def next_token(self):
        '''Return the next ``(token_type, value)`` pair, or ``(EOF, 'eof')``
        when the input is exhausted.

        Raises Exception on a character that no rule accepts.
        '''
        while self.cur_c != EOF:
            c = self.cur_c
            if c.isspace():
                self.consume()
            elif c == '[':
                self.consume()
                return (LBRACK, c)
            elif c == ']':
                self.consume()
                return (RBRACK, c)
            elif c == ',':
                self.consume()
                return (COMMA, c)
            elif c == '=':
                self.consume()
                return (EQUAL, c)
            elif c == '*':
                self.consume()
                return (TIMES, c)
            elif c == '+':
                self.consume()
                return (PLUS, c)
            elif c == '\'' or c == '"':
                return self._string()
            elif c.isdigit():
                return self._int()
            elif c.isalpha():
                # Could be the 'print' keyword or a plain identifier.
                t = self._print()
                return t if t else self._id()
            else:
                raise Exception('not support token')
        return (EOF, 'eof')

    def has_next(self):
        '''True while there are still characters left to tokenize.'''
        return self.cur_c != EOF

    def _id(self):
        '''id: a run of alphabetic characters.'''
        n = self.cur_c
        self.consume()
        # EOF is an int, so test it before calling a str method on cur_c.
        while self.cur_c != EOF and self.cur_c.isalpha():
            n += self.cur_c
            self.consume()
        return (ID, n)

    def _int(self):
        '''int: a run of digits, converted to a Python int.'''
        n = self.cur_c
        self.consume()
        while self.cur_c != EOF and self.cur_c.isdigit():
            n += self.cur_c
            self.consume()
        return (INT, int(n))

    def _print(self):
        '''Try to match the 'print' keyword; return None if it is absent.'''
        # idx - 1 is the position of cur_c, so this peeks 5 chars ahead.
        n = self.input[self.idx - 1 : self.idx + 4]
        if n == 'print':
            # Skip the matched keyword, then let consume() re-establish the
            # cur_c/idx invariant (setting cur_c = input[idx] directly would
            # leave idx one short and make the next character be read twice).
            self.idx += 4
            self.consume()
            return (PRINT, n)
        return None

    def _string(self):
        '''str: the characters between a pair of matching quotes
        (either single or double), not spanning a newline.

        Raises Exception when the closing quote is missing.
        '''
        quotes_type = self.cur_c
        self.consume()
        s = ''
        while self.cur_c != EOF and self.cur_c != '\n' and self.cur_c != quotes_type:
            s += self.cur_c
            self.consume()
        if self.cur_c != quotes_type:
            raise Exception('string quotes is not matched. expected %s' % quotes_type)
        self.consume()
        return (STR, s)

    def consume(self):
        '''Advance to the next character, or to EOF at the end of input.'''
        if self.idx >= len(self.input):
            self.cur_c = EOF
            return
        self.cur_c = self.input[self.idx]
        self.idx += 1


if __name__ == '__main__':
    exp = '''
        veca = [1, 2, 3]
        print 'veca:', veca
        print 'veca * 2:', veca * 2
        print 'veca + 2:', veca + 2
    '''
    lex = veclexer(exp)
    t = lex.next_token()
    while t[0] != EOF:
        print(t)
        t = lex.next_token()

veca = [1, 2, 3]
print 'veca:', veca
print 'veca * 2:', veca * 2
print 'veca + 2:', veca + 2

('id', 'veca')
('equal', '=')
('lbrack', '[')
('int', 1)
('comma', ',')
('int', 2)
('comma', ',')
('int', 3)
('rbrack', ']')
('print', 'print')
('str', 'veca:')
('comma', ',')
('id', 'veca')
('print', 'print')
('str', 'veca * 2:')
('comma', ',')
('id', 'veca')
('times', '*')
('int', 2)
('print', 'print')
('str', 'veca + 2:')
('comma', ',')
('id', 'veca')
('plus', '+')
('int', 2)

””’
a simple parser of a small vector language.
statlist: stat+
stat: id ‘=’ expr
| ‘print’ expr (, expr)*
expr: multipart (‘+’ multipart)*
| str
multipart: primary (‘*’ primary)*
primary: int
| id
| ‘[‘ expr (‘,’, expr)* ‘]’
int: (1..9)(0..9)*
id: (a..z | a..z)*
str: (\”.*\”) | (\’.*\’)
example:
veca = [1, 2, 3]
vecb = veca + 4 # vecb: [1, 2, 3, 4]
vecc = veca * 3 # vecc:
created on 2012-9-26
@author: bjzllou
”’
import veclexer
class vecparser:
    '''
    LL(1) parser and on-the-fly interpreter for the small vector language.

    It pulls tokens from a ``veclexer`` instance; one lookahead token
    (``cur_token``) drives every parse decision.  Variable values are kept
    in ``symtab``.
    '''

    def __init__(self, lexer):
        self.lexer = lexer
        # Lookahead token; every parse option is chosen based on it.
        self.cur_token = lexer.next_token()
        # Similar to a symbol table; here it only stores variables' values.
        self.symtab = {}

    def statlist(self):
        '''statlist: stat+

        NOTE(review): loops on the lexer's remaining characters, so it
        assumes the program ends with trailing whitespace/newline — confirm
        for inputs that end exactly on the last token.
        '''
        while self.lexer.has_next():
            self.stat()

    def stat(self):
        '''stat: id '=' expr | 'print' expr (',' expr)*

        Raises Exception on any other leading token.
        '''
        token_type, token_val = self.cur_token
        # Assignment.
        if token_type == veclexer.ID:
            self.consume()
            # A terminal token only needs to be matched and consumed;
            # if it does not match, that is a syntax error.
            self.match(veclexer.EQUAL)
            # Store the value into the symbol table.
            self.symtab[token_val] = self.expr()
        # Print statement: join the comma-separated values with spaces.
        elif token_type == veclexer.PRINT:
            self.consume()
            v = str(self.expr())
            while self.cur_token[0] == veclexer.COMMA:
                self.match(veclexer.COMMA)
                v += ' ' + str(self.expr())
            print(v)
        else:
            raise Exception('not support token %s' % token_type)

    def expr(self):
        '''expr: multipart ('+' multipart)* | str

        Returns the string for the str alternative, otherwise the computed
        vector.  ``vector + int`` appends in place (mutating the vector
        stored in symtab, as the language's sample output documents), while
        ``vector + vector`` builds a new concatenated list.
        '''
        token_type, token_val = self.cur_token
        if token_type == veclexer.STR:
            self.consume()
            return token_val
        v = self.multipart()
        while self.cur_token[0] == veclexer.PLUS:
            self.consume()
            v1 = self.multipart()
            if type(v1) == int:
                v.append(v1)
            elif type(v1) == list:
                v = v + v1
        return v

    def multipart(self):
        '''multipart: primary ('*' primary)*

        ``vector * int`` scales each element; ``vector * vector`` is the
        cross product of elementwise products.  Assumes the left operand is
        a vector (a bare int on the left would fail, as in the original
        design).
        '''
        v = self.primary()
        while self.cur_token[0] == veclexer.TIMES:
            self.consume()
            v1 = self.primary()
            if type(v1) == int:
                v = [x * v1 for x in v]
            elif type(v1) == list:
                v = [x * y for x in v for y in v1]
        return v

    def primary(self):
        '''primary: int | id | '[' expr (',' expr)* ']'

        Raises Exception for an undefined variable or an unexpected token.
        '''
        token_type = self.cur_token[0]
        token_val = self.cur_token[1]
        # Integer literal.
        if token_type == veclexer.INT:
            self.consume()
            return token_val
        # Variable reference.
        elif token_type == veclexer.ID:
            self.consume()
            if token_val in self.symtab:
                return self.symtab[token_val]
            else:
                raise Exception('undefined variable %s' % token_val)
        # List literal.
        elif token_type == veclexer.LBRACK:
            self.match(veclexer.LBRACK)
            v = [self.expr()]
            while self.cur_token[0] == veclexer.COMMA:
                self.match(veclexer.COMMA)
                v.append(self.expr())
            self.match(veclexer.RBRACK)
            return v
        else:
            raise Exception('unexpected token %s' % token_type)

    def consume(self):
        '''Advance the lookahead to the next token.'''
        self.cur_token = self.lexer.next_token()

    def match(self, token_type):
        '''Consume the lookahead if it has the given type, else raise.'''
        if self.cur_token[0] == token_type:
            self.consume()
            return True
        raise Exception('expecting %s; found %s' % (token_type, self.cur_token[0]))
if __name__ == '__main__':
    # Demo program written in the small vector language (not Python):
    # assignments, printing, and vector +/- * arithmetic.
    prog = '''
        veca = [1, 2, 3]
        vecb = [4, 5, 6]
        print 'veca:', veca
        print 'veca * 2:', veca * 2
        print 'veca + 2:', veca + 2
        print 'veca + vecb:', veca + vecb
        print 'veca + [11, 12]:', veca + [11, 12]
        print 'veca * vecb:', veca * vecb
        print 'veca:', veca
        print 'vecb:', vecb
    '''
    # Lex the program, then let the parser interpret it statement by
    # statement, printing the results as a side effect.
    lex = veclexer.veclexer(prog)
    parser = vecparser(lex)
    parser.statlist()

Posted in 未分类