There are a few performance tricks we can apply here:
Adding `__slots__` to the class definition should help with memory and performance as well:
class _RegExLib:
"""Set up regular expressions"""
# use https://regexper.com to visualise these if required
_reg_school = re.compile(r'School = (.*)\n')
_reg_grade = re.compile(r'Grade = (.*)\n')
_reg_name_score = re.compile(r'(Name|Score)')
__slots__ = ['school', 'grade', 'name_score']
def __init__(self, line):
# check whether line has a positive match with all of the regular expressions
self.school = self._reg_school.match(line)
self.grade = self._reg_grade.match(line)
self.name_score = self._reg_name_score.search(line)
Using `next()` instead of `.readline()` should be faster, as iterating over the file uses a read-ahead buffer internally:
# Walk the file line by line. "School = ..." and "Grade = ..." lines update
# the current context; a header line containing "Name" or "Score" starts a
# table whose rows are read until a blank line or the end of the file.
with open(filepath, 'r') as handle:
    # A default of None keeps an empty file from raising StopIteration.
    line = next(handle, None)
    while line:
        reg_match = _RegExLib(line)

        if reg_match.school:
            school = reg_match.school.group(1)

        if reg_match.grade:
            grade = reg_match.grade.group(1)
            grade = int(grade)

        if reg_match.name_score:
            # The header decides which key ('Name' or 'Score') the rows fill.
            value_type = reg_match.name_score.group(1)
            line = next(handle, None)
            # Consume table rows: each is expected to be "<number>,<value>".
            while line and line.strip():
                number, value = line.strip().split(',')
                value = value.strip()
                # NOTE: `school` and `grade` must already be bound here, i.e.
                # a School and a Grade line must precede the first table
                # (unless they are initialised before this block).
                dict_of_data = {
                    'School': school,
                    'Grade': grade,
                    'Student number': number,
                    value_type: value
                }
                data.append(dict_of_data)
                line = next(handle, None)

        line = next(handle, None)
Some of the code style and other notes:
- `file` shadows a built-in name in Python 2 (it is not a reserved keyword); think of a different variable name
- define your regular expression strings as raw strings
- you can probably replace the `.*` wildcard with a more concrete `\d+` for the "grade" regex: `Grade = (\d+)\n`