The bounding box (x, y, x1, y1) is represented below as (left, top, left1, top1). Middle is the midpoint between left and left1, and left_diff is the gap between the current row's starting x position (left) and the previous row's finishing x position (left1.shift()). Width is the distance from left to left1.
top top1 left middle left1 left_diff width
0 7378.0 141126 7654 28262.0 489 70.0 NaN 16.0
1 7378.0 95123 71 614 66794.0 721 118.0 1.0 47.0
2 9578.0 117126 614125 683136.0 753147.0 7.0 22.0
3 11878.0 140 123 614 147 668 215.0 722283.0 0.0 136.0
4 140167.0 162199 61454 715 130.0 816206.0 -229.0 152.0
5 163167.0 185187 614664 629701.0 645739.0 458.0 75.0
6 254186.0 272204 76664 722.0 118 780.0 160-75.0 116.0
7 254202.0 272220 614664 638751.0 662838.0 -116.0 174.0
8 279212.0 298234 61454 703 347.0 792641.0 -784.0 587.0
9 294212.0 315237 76664 737.0 811.0 76 23.0 77147.0
10 294232.0 315254 7754 296347.0 516641.0 -757.0 587.0
11 294232.0 321253 614664 710701.0 806738.0 23.0 74.0
12 313232.0 334253 76826 839.0 167853.0 25988.0 27.0
13 326253.0 345275 61454 703 137.0 792220.0 -799.0 166.0
14 341268.0 361286 76664 717.0 147770.0 219444.0 106.0
15 350285.0 369310 61454 698 347.0 783641.0 -716.0 587.0
16 373285.0 392303 614664 715759.0 817855.0 23.0 191.0
17 383301.0 404330 7654 76347.0 641.0 77 -801.0 587.0
18 383301.0 404319 77664 684.0 276 704.0 47623.0 40.0
19 397301.0 416319 614826 713839.0 812853.0 122.0 27.0
20 410328.0 430350 7654 158347.0 241641.0 -799.0 587.0
....... etc......
import itertools

import numpy as np
def pairwise(splits):
    """Return the consecutive, overlapping pairs of *splits* as a list.

    ``s -> [(s0, s1), (s1, s2), (s2, s3), ...]``

    Args:
        splits: Any iterable.

    Returns:
        A list of 2-tuples; empty when *splits* has fewer than two items.
    """
    a, b = itertools.tee(splits, 2)
    # Advance the second copy by one so zip() lines each item up with its
    # successor; the default of None keeps an empty input from raising.
    next(b, None)
    return list(zip(a, b))
def space_sort(df):
    """Find table-like groups of text boxes and print their row/column slots.

    The work is done in three steps:

    1. Rows whose ``table`` flag is False are processed page by page and
       merged into boxes (see ``_page_boxes``).
    2. Boxes on the same page whose top edge, vertical centre or bottom edge
       agree within 5 units are paired (see ``_aligned_pairs``).
    3. Each pair is compared with the next one.  When the last index label
       of the next pair's first box directly follows the last index label of
       the previous pair's second box, both pairs are printed, one row
       number per pair and one column number per box.  Otherwise the row
       counter is reset to 0.

    NOTE(review): step 3 prints *both* pairs for every consecutive match, so
    a pair in the middle of a run is printed twice with different row
    numbers.  That is kept as in the original; confirm whether it is wanted.

    Args:
        df: DataFrame with the columns ``page``, ``table``, ``top``,
            ``top1``, ``left``, ``middle``, ``left1`` and ``left_diff``.

    Returns:
        None.  Results are written to stdout.
    """
    # Use the frame that was passed in throughout; the function must not
    # depend on a module-level DataFrame happening to exist.
    text_rows = df.loc[df['table'] == False]  # noqa: E712 (element-wise test)

    boxes = {}
    for page, page_rows in text_rows.groupby('page'):
        boxes.update(_page_boxes(page, page_rows))

    table = _aligned_pairs(boxes)

    # Table list contains two lists of indexes of rows which are close together.
    # As ordered, the indexes should be sequential.
    # If difference between one pair and next is 1, sequential. If not, reset rows
    row = 0
    for prev_pair, next_pair in zip(table, table[1:]):
        if next_pair[0][-1] - prev_pair[1][-1] == 1:
            for pair in (prev_pair, next_pair):
                row += 1
                for num, col in enumerate(pair, start=1):
                    print('indexes', col, 'row', row, 'col', num)
        else:
            row = 0


def _page_boxes(page, page_rows):
    """Return ``{(page, top, left, top1, left1): index labels}`` for one page."""
    # Rows that start after a horizontal gap of more than 5 units.
    gaps = page_rows.loc[page_rows['left_diff'] > 5]

    # Value count on left, middle and left1 so that left-, centre- and
    # right-justified text are all handled.  A column takes part when any of
    # its values occurs more than twice among the gap rows; every distinct
    # value of that column then becomes a candidate x position.
    candidates = {
        col: gaps[col].unique()
        for col in ('left', 'middle', 'left1')
        if (gaps[col].value_counts() > 2).any()
    }
    if 'left' in candidates:
        # Also try the smallest left value on the page.
        min_x = page_rows['left'].min()
        candidates['left'] = np.insert(candidates['left'], 0, int(min_x))

    boxes = {}
    for col, x_values in candidates.items():
        for x in x_values:
            # Search the page for other rows close to this x value.
            near = np.isclose(page_rows[col].to_numpy(), x, atol=5)
            aligned = page_rows.loc[near]
            if aligned.empty:
                # Happens for a NaN candidate; there is nothing to box.
                continue
            # A row whose top lies more than 5 units below the previous
            # aligned row's bottom starts a new box.
            breaks = (aligned['top'] - aligned['top1'].shift()) > 5
            starts = np.flatnonzero(breaks.to_numpy())
            bounds = np.concatenate(([0], starts, [len(aligned)]))
            for start, end in zip(bounds[:-1], bounds[1:]):
                box = aligned.iloc[start:end]
                coords = (
                    page,
                    box['top'].min(),
                    box['left'].min(),
                    box['top1'].max(),
                    box['left1'].max(),
                )
                # The same box can be reached through several x values;
                # keying on its coordinates keeps a single entry.
                boxes[coords] = box.index.tolist()
    return boxes


def _aligned_pairs(boxes):
    """Return ``[labels_a, labels_b]`` for same-page boxes on the same line."""
    # Find close boxes by seeing which align on the same y value (either
    # top, centre or bottom).
    table = []
    for a, b in itertools.combinations(boxes, 2):
        a_pg, a_top, _, a_top1, _ = a
        b_pg, b_top, _, b_top1, _ = b
        if a_pg != b_pg:
            # Equal y values on different pages do not form a table row.
            continue
        a_centre = (a_top + a_top1) // 2
        b_centre = (b_top + b_top1) // 2
        if (
            np.isclose(a_top, b_top, atol=5)
            or np.isclose(a_centre, b_centre, atol=5)
            or np.isclose(a_top1, b_top1, atol=5)
        ):
            table.append([boxes[a], boxes[b]])
    return table