I've been working on a general-purpose lexer for a programming language for a couple of days now. I don't know whether the code I've written is overcomplicated, whether there is a better way to split the source into tokens, or whether a library would make my code simpler.
Code:
from enum import Enum
from typing import Any
# Character classes used by the lexer. They are kept as lists so existing
# code that indexes or concatenates them keeps working.

# Single-character operators; each occurrence becomes its own OP token.
OPERATORS = list("+~-*/^&<>()[]{}=!?|,")
# TODO: add double and triple-char operator support
#DOUBLECHAROPERATORS = ["|>", "<|", "!=", "==", "<<", ">>"]
#TRIPLECHAROPERATORS = [""]
# Characters allowed in a name (note: no digits).
NAMECHARS = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._')
# Characters allowed in a number literal.
NUMS = list('1234567890.')
# Statement separators emitted as LINEFEED tokens.
LINEFEEDS = list(';\n')
# Characters that end a single-line comment.
CMTLINEFEEDS = ['\n']
# Markers that start a single-line comment.
SLICOMMENTS = ['#', '//']
# [opener, closer] pairs for multi-line comments. The opener used to be
# '\\*' (backslash-star), which does not match the '/*' the lexer looks for.
MLICOMMENTS = [['/*', '*/']]
# Quote characters that delimit a string literal.
STRINGCHARS = list('\'\"')
# Every code point below 256 except the quote characters and the newline.
# Built directly instead of removing items through a side-effect comprehension.
STRCHARS = [c for c in map(chr, range(256)) if c != '\n' and c not in STRINGCHARS]
class TokenTypes(Enum):
    """Kinds of token stored in ``Token.type``."""

    SPECIAL = -1

    # Structural tokens.
    OP = 0
    NAME = 1
    COMMENT = 2
    LINEFEED = 3

    # Literal tokens.
    STRING = 4
    NUMBER = 5
class Token:
def __init__(self, type: int, data: Any | str):
self.type = type
self.data = data
def __repr__(self):
return f"Token({self.type}, {repr(self.data)})"
class Modes:
    """Named on/off flags describing which state the lexer is in.

    The class-level flags define the default value of each mode. Every
    ``Modes()`` instance receives its own copies, so toggling a mode on one
    instance never leaks into another (previously all instances shared and
    mutated the class-level objects).
    """

    class ModeBool():
        """Mutable boolean toggled in place through unary operators.

        ``+flag`` sets the flag, ``-flag`` clears it and ``~flag`` flips it.
        Each operator mutates the flag and evaluates to ``None``; the flag is
        read back through ``bool(flag)``.
        """

        boolean = False

        def __init__(self, val: bool = False):
            self.boolean = val

        def __neg__(self):
            self.boolean = False

        def __pos__(self):
            self.boolean = True

        def __invert__(self):
            self.boolean = not self.boolean

        # ``__inv__`` is not a special method, so ``~flag`` never reached it.
        # The old name stays as an alias for code that called it explicitly.
        __inv__ = __invert__

        def __bool__(self):
            return self.boolean

        def __repr__(self):
            return str(self.boolean)

    REGULAR = ModeBool(True)
    STRING = ModeBool(False)
    COMMENT = ModeBool(False)
    COMMENTML = ModeBool(False)
    NAME = ModeBool(False)
    NUM = ModeBool(False)

    def __init__(self) -> None:
        """Give this instance a private copy of every class-level flag."""
        cls = type(self)
        for name in dir(cls):
            default = getattr(cls, name)
            if isinstance(default, Modes.ModeBool):
                setattr(self, name, Modes.ModeBool(default.boolean))

    def __repr__(self):
        """Return a dict-style listing of every mode name and its state."""
        return str({i:getattr(self, i) for i in dir(self) if not i.startswith('_') and i != 'ModeBool'})
class Lexer:
    """Character-driven state machine that turns source text into ``Token`` objects.

    Exactly one flag of ``self.modes`` is active at a time. ``REGULAR`` decides
    what kind of token starts at the current character; every other mode
    collects the text of one token in ``self.currdata['str']`` and switches
    back to ``REGULAR`` once the token is complete.

    Produced tokens (appended to ``self.tokens``):

    * ``OP`` - one character from ``OPERATORS``.
    * ``LINEFEED`` - one character from ``LINEFEEDS``.
    * ``COMMENT`` - a single-line comment (marker included, terminator
      excluded) or a ``/* ... */`` comment (delimiters included).
    * ``NAME`` - a run of ``NAMECHARS``.
    * ``NUMBER`` - a run of ``NUMS``.
    * ``STRING`` - quoted text including both quotes.

    Characters that start no token (spaces, tabs, ...) are skipped.

    NOTE: ``NAMECHARS`` contains no digits, so ``x1`` lexes as NAME ``x``
    followed by NUMBER ``1``. Strings have no escape sequences.
    """

    def __init__(self, src: str) -> None:
        self.position = 0        # index of the next character to examine
        self.src = src
        self.tokens = []         # Token objects in source order
        self.modes = Modes()
        self.currdata = {}       # scratch data for the token being built

    def parse(self) -> None:
        """Tokenize ``self.src`` from ``self.position`` into ``self.tokens``."""
        self._main_parse_loop()

    def _main_parse_loop(self) -> None:
        """Feed characters to the active mode until the input is exhausted."""
        src = self.src
        end = len(src)
        while self.position < end:
            char = src[self.position]
            # '' past the end: two-character lookahead simply fails on the
            # last character instead of reading a stale or unbound value.
            nxtchar = src[self.position + 1] if self.position + 1 < end else ''
            self.position += self._step(char, nxtchar)
        self._flush_pending()

    def _step(self, char: str, nxtchar: str) -> int:
        """Dispatch to the active mode; return how many characters it consumed.

        A return value of 0 means the mode changed and the same character
        must be looked at again by the new mode.
        """
        modes = self.modes
        if modes.REGULAR:
            return self._lex_regular(char, nxtchar)
        if modes.COMMENT:
            return self._lex_line_comment(char)
        if modes.COMMENTML:
            return self._lex_block_comment(char, nxtchar)
        if modes.NAME:
            return self._lex_name(char)
        if modes.STRING:
            return self._lex_string(char)
        if modes.NUM:
            return self._lex_number(char)
        return 1

    def _begin(self, mode: "Modes.ModeBool", text: str = '') -> None:
        """Leave REGULAR mode and start collecting a token in ``mode``."""
        # ModeBool overloads unary minus/plus to clear/set the flag in place.
        -self.modes.REGULAR
        +mode
        self.currdata = {'str': text}

    def _end(self, mode: "Modes.ModeBool", token_type: "TokenTypes") -> None:
        """Emit the collected text as a token and return to REGULAR mode."""
        self.tokens.append(Token(token_type, self.currdata['str']))
        self.currdata = {}
        -mode
        +self.modes.REGULAR

    def _lex_regular(self, char: str, nxtchar: str) -> int:
        """Decide which token starts at ``char``."""
        if char == '/' and nxtchar == '*':
            self._begin(self.modes.COMMENTML, '/*')
            return 2
        # Must run before the operator test: '/' is an operator, and a
        # multi-character marker such as '//' can never equal a single char.
        if self.src.startswith(tuple(SLICOMMENTS), self.position):
            self._begin(self.modes.COMMENT)
            return 0  # the marker itself becomes part of the comment text
        if char in OPERATORS:
            self.tokens.append(Token(TokenTypes.OP, char))
            return 1
        if char in LINEFEEDS:
            self.tokens.append(Token(TokenTypes.LINEFEED, char))
            return 1
        if char in NAMECHARS:
            self._begin(self.modes.NAME)
            return 0
        if char in STRINGCHARS:
            self._begin(self.modes.STRING, char)
            # Only the same quote character may close the string.
            self.currdata['quote'] = char
            return 1
        if char in NUMS:
            self._begin(self.modes.NUM)
            return 0
        return 1  # whitespace and unknown characters are skipped

    def _lex_line_comment(self, char: str) -> int:
        """Collect a single-line comment up to, but excluding, its terminator."""
        # CMTLINEFEEDS, not LINEFEEDS: a ';' inside a comment must not end it.
        if char in CMTLINEFEEDS:
            self._end(self.modes.COMMENT, TokenTypes.COMMENT)
            return 0  # the newline is re-read and emitted as a LINEFEED
        self.currdata['str'] += char
        return 1

    def _lex_block_comment(self, char: str, nxtchar: str) -> int:
        """Collect a ``/* ... */`` comment including its closing delimiter."""
        if char == '*' and nxtchar == '/':
            self.currdata['str'] += '*/'
            self._end(self.modes.COMMENTML, TokenTypes.COMMENT)
            return 2
        self.currdata['str'] += char
        return 1

    def _lex_name(self, char: str) -> int:
        """Collect name characters; any other character ends the name."""
        if char in NAMECHARS:
            self.currdata['str'] += char
            return 1
        # The terminating character is re-read in REGULAR mode rather than
        # being dropped, so nothing following a name is lost.
        self._end(self.modes.NAME, TokenTypes.NAME)
        return 0

    def _lex_string(self, char: str) -> int:
        """Collect string text until the opening quote character recurs."""
        # Every character is kept, including newlines and non-Latin-1 text.
        self.currdata['str'] += char
        if char == self.currdata['quote']:
            self._end(self.modes.STRING, TokenTypes.STRING)
        return 1

    def _lex_number(self, char: str) -> int:
        """Collect number characters; any other character ends the number."""
        if char in NUMS:
            self.currdata['str'] += char
            return 1
        self._end(self.modes.NUM, TokenTypes.NUMBER)
        return 0

    def _flush_pending(self) -> None:
        """Emit a token still being collected when the input ends.

        Unterminated strings and block comments are emitted with the text
        gathered so far rather than being discarded. Afterwards the lexer is
        back in REGULAR mode.
        """
        modes = self.modes
        pending = (
            (modes.COMMENT, TokenTypes.COMMENT),
            (modes.COMMENTML, TokenTypes.COMMENT),
            (modes.NAME, TokenTypes.NAME),
            (modes.STRING, TokenTypes.STRING),
            (modes.NUM, TokenTypes.NUMBER),
        )
        for mode, token_type in pending:
            if mode:
                self._end(mode, token_type)
if __name__ == '__main__':
    # Sample program that exercises every token kind the lexer recognises.
    code = """
import github.octocat.gopher.libgopher as go
if go.enabled != true {exit()} # Very much a test ----+-
compile_hello = () -> {go.compile('Hello, World!')}
print(compile_hello())
compile_hello() |> print
"VeryWell1000000" |> print
for i in range(0,5) {
print(i)
}
/*
this
should
not
be parsed *-*
*/
20 & 30 | 50
a = "
never
gonna
give
you
up
"
b = "never gonna let you down"; rick = a + b
/*
regex = r'.*'
*/
if not b + 1 == 2 {print('a')}
"""
    lexer: Lexer = Lexer(code)
    lexer.parse()
    # Dump the tokens one per line, then their raw text joined by spaces,
    # then the original source for comparison.
    for token in lexer.tokens:
        print(token)
    print(' '.join(token.data for token in lexer.tokens))
    print(code)