You can adjust those entries by defining a set of rules for processing each row, then building a pandas.DataFrame from the pre-processed rows.
Here is an example implementation using a couple of rows from your example (the dates seem too poorly formatted to parse automatically, but maybe you have additional insight):
import pandas
def proc_row(
    row: str,
    *,
    default_unit: str = '???',
    known_units=('mg', 'ml'),
) -> dict:
    """Parse one comma-separated data row into a column-name -> value dict.

    Fields are consumed left to right in this order: index, BNF code, date
    prescribed, prescribed location, dose, unit, colour, shape, max safe
    temperature, refrigeration flag, dose frequency.

    The unit field is treated as optional. When the field following the dose
    is not one of ``known_units`` it is left in place (and so becomes the
    colour), and ``default_unit`` is stored instead, which keeps the
    remaining fields aligned with their columns.

    Args:
        row: A single data line (not the header line).
        default_unit: Placeholder stored under ``'unit'`` when the row has no
            recognised unit field.
        known_units: Container of strings accepted as a unit; anything
            supporting ``in`` works.

    Returns:
        A dict with the keys ``index``, ``bnf_code``, ``date_prescribed``,
        ``prescribed_location``, ``dose``, ``unit``, ``colour``, ``shape``,
        ``max_temp_c``, ``needs_refri`` and ``dose_frequency``, inserted in
        that order.

    Raises:
        ValueError: If a numeric field cannot be converted, or if fields are
            left over after the last column has been read.
        IndexError: If the row runs out of fields before the last column.
    """
    out_dict = {}
    raw_fields = row.split(',')

    out_dict['index'] = int(raw_fields.pop(0))
    out_dict['bnf_code'] = int(raw_fields.pop(0))
    # The date is kept as the raw string; no parsing is attempted here.
    out_dict['date_prescribed'] = raw_fields.pop(0)
    out_dict['prescribed_location'] = raw_fields.pop(0)
    out_dict['dose'] = float(raw_fields.pop(0))

    # Only consume the next field as the unit when it looks like one;
    # otherwise leave it for the colour column and record the placeholder.
    if raw_fields[0] in known_units:
        out_dict['unit'] = raw_fields.pop(0)
    else:
        out_dict['unit'] = default_unit

    out_dict['colour'] = raw_fields.pop(0)
    out_dict['shape'] = raw_fields.pop(0)
    out_dict['max_temp_c'] = int(raw_fields.pop(0))
    # The flag is converted via int first, so '0' becomes False.
    out_dict['needs_refri'] = bool(int(raw_fields.pop(0)))
    out_dict['dose_frequency'] = raw_fields.pop(0)

    if raw_fields:
        raise ValueError('CSV contained too many columns!')
    return out_dict
def _main():
    """Parse a small inline CSV sample with ``proc_row`` and print the frame."""
    sample_csv = (
        'Index,BNF_code,DatePrescribed,PrescribedLocation,Dose,Unit,Colour,Shape,MaxSafeTemp_C,NeedsRefrigeration,DoseFrequency\n'
        '0,601,5012023,hosp,4,ml,cloudy,na,6,1,2x\n'
        '1,1001,26072024,pharm,200,mg,orange,round,25,0,4x\n'
        '2,501,12112023,notknown,500,white,oval,25,0,2x\n'
        ''
    )

    # The first line is the header; the column names come from proc_row's
    # dict keys instead, so it is discarded.
    _header, *body_lines = sample_csv.split('\n')

    # Splitting on '\n' leaves an empty trailing entry; skip blank lines.
    records = [proc_row(entry) for entry in body_lines if entry]

    # Widen the display limits so every column is printed on one line.
    pandas.set_option('display.max_columns', 500)
    pandas.set_option('display.width', 500)

    print(pandas.DataFrame(records))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    _main()
And here is the well-formatted console output it produces:
index bnf_code date_prescribed prescribed_location dose unit colour shape max_temp_c needs_refri dose_frequency
0 0 601 5012023 hosp 4.0 ml cloudy na 6 True 2x
1 1 1001 26072024 pharm 200.0 mg orange round 25 False 4x
2 2 501 12112023 notknown 500.0 ??? white oval 25 False 2x
You can add as much additional logic as necessary to handle other rows that are read incorrectly.
`shift` in `pandas`? It is for moving rows up or down, not for moving columns left or right. And you could add the tag `pandas` for this. You could check whether `unit` has a correct value (or check if the last column has `None`) and move cells to the right using another for-loop. Eventually you could get this row as a standard `list` and use `.insert(position, value)` (to add the missing value) and later assign this list again to the same row.