Tweeted twitter.com/StackCodeReview/status/1283778446435520514

occurred Jul 16, 2020 at 15:00

Clarified question - dataframe columns explained in description

Source Link

edited Jul 16, 2020 at 14:57

141
3

The bounding box (x,y,x1,y1) is represented below as (left,top,left1,top1). Middle is the mid-point between left and left1 and left_diff is the gap between current rows starting x position (left) and previous rows finishing x1 position (left1.shift()). Width is the left to left1 size.

    top     top1    left    middle  left1   left_diff   width
0   7378.0    141126     7654      28262.0   489 70.0    NaN     16.0
1   7378.0    95123     71 614     66794.0   721 118.0   1.0     47.0
2   9578.0    117126     614125     683136.0   753147.0   7.0     22.0
3   11878.0   140 123    614 147    668 215.0   722283.0   0.0     136.0
4   140167.0   162199     61454     715 130.0   816206.0   -229.0  152.0
5   163167.0   185187     614664     629701.0   645739.0   458.0   75.0
6   254186.0   272204     76664     722.0 118  780.0   160-75.0   116.0
7   254202.0   272220     614664     638751.0   662838.0   -116.0  174.0
8   279212.0   298234     61454     703 347.0   792641.0   -784.0  587.0
9   294212.0   315237     76664     737.0   811.0  76 23.0    77147.0
10  294232.0   315254     7754      296347.0   516641.0   -757.0  587.0
11  294232.0   321253     614664     710701.0   806738.0   23.0    74.0
12  313232.0   334253     76826     839.0   167853.0   25988.0    27.0
13  326253.0   345275     61454     703 137.0   792220.0   -799.0  166.0
14  341268.0   361286     76664     717.0   147770.0   219444.0   106.0
15  350285.0   369310     61454     698 347.0   783641.0   -716.0  587.0
16  373285.0   392303     614664     715759.0   817855.0   23.0    191.0
17  383301.0   404330     7654      76347.0   641.0 77  -801.0  587.0
18  383301.0   404319     77664     684.0 276  704.0   47623.0    40.0
19  397301.0   416319     614826     713839.0   812853.0   122.0   27.0
20  410328.0   430350     7654      158347.0   241641.0   -799.0  587.0

....... etc......

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
#  If difference between one pair and next is 1, sequential. If not, reset rows

    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

    top     top1    left    middle  left1
0   73.0    141     76      282.0   489.0
1   73.0    95      614     667.0   721.0
2   95.0    117     614     683.0   753.0
3   118.0   140     614     668.0   722.0
4   140.0   162     614     715.0   816.0
5   163.0   185     614     629.0   645.0
6   254.0   272     76      118.0   160.0
7   254.0   272     614     638.0   662.0
8   279.0   298     614     703.0   792.0
9   294.0   315     76      76.0    77.0
10  294.0   315     77      296.0   516.0
11  294.0   321     614     710.0   806.0
12  313.0   334     76      167.0   259.0
13  326.0   345     614     703.0   792.0
14  341.0   361     76      147.0   219.0
15  350.0   369     614     698.0   783.0
16  373.0   392     614     715.0   817.0
17  383.0   404     76      76.0    77.0
18  383.0   404     77      276.0   476.0
19  397.0   416     614     713.0   812.0
20  410.0   430     76      158.0   241.0

....... etc......

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
#  If difference between one pair and next is 1, sequential. If not, reset rows

    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

The bounding box (x,y,x1,y1) is represented below as (left,top,left1,top1). Middle is the mid-point between left and left1 and left_diff is the gap between current rows starting x position (left) and previous rows finishing x1 position (left1.shift()). Width is the left to left1 size.

    top     top1    left    middle  left1   left_diff   width
0   78.0    126     54      62.0    70.0    NaN     16.0
1   78.0    123     71      94.0    118.0   1.0     47.0
2   78.0    126     125     136.0   147.0   7.0     22.0
3   78.0    123     147     215.0   283.0   0.0     136.0
4   167.0   199     54      130.0   206.0   -229.0  152.0
5   167.0   187     664     701.0   739.0   458.0   75.0
6   186.0   204     664     722.0   780.0   -75.0   116.0
7   202.0   220     664     751.0   838.0   -116.0  174.0
8   212.0   234     54      347.0   641.0   -784.0  587.0
9   212.0   237     664     737.0   811.0   23.0    147.0
10  232.0   254     54      347.0   641.0   -757.0  587.0
11  232.0   253     664     701.0   738.0   23.0    74.0
12  232.0   253     826     839.0   853.0   88.0    27.0
13  253.0   275     54      137.0   220.0   -799.0  166.0
14  268.0   286     664     717.0   770.0   444.0   106.0
15  285.0   310     54      347.0   641.0   -716.0  587.0
16  285.0   303     664     759.0   855.0   23.0    191.0
17  301.0   330     54      347.0   641.0   -801.0  587.0
18  301.0   319     664     684.0   704.0   23.0    40.0
19  301.0   319     826     839.0   853.0   122.0   27.0
20  328.0   350     54      347.0   641.0   -799.0  587.0

....... etc......

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
#  If difference between one pair and next is 1, sequential. If not, reset rows

    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

Clarified question - text extraction has already been completed prior to the code shown

Source Link

edited Jul 16, 2020 at 6:10

lawson

141
3

Determine table structure in imagepdf using whitespace between coordinates

The pagetext is converted to an image with rectangular boxes forextracted from the file and the coordinates of each stripblock of text are stored in a dataframe. For the sake of this snippet, this has already been generated and has yielded the dataframe below. This is ordered top to bottom, left to right in reading order.

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
#  If difference between one pair and next is 1, sequential. If not, reset rows to 1 

    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

Determine table structure in image using whitespace between coordinates

The page is converted to an image with rectangular boxes for each strip of text. For the sake of this snippet, this has already been generated and has yielded the dataframe below. This is ordered top to bottom, left to right in reading order.

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
# If difference between one pair and next is 1, sequential. If not, reset rows to 1
    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

Determine table structure in pdf using whitespace between coordinates

The text is extracted from the file and the coordinates of each block of text are stored in a dataframe. For the sake of this snippet, this has already been generated and has yielded the dataframe below. This is ordered top to bottom, left to right in reading order.

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
#  If difference between one pair and next is 1, sequential. If not, reset rows 

    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

added 2 characters in body

Source Link

edited Jul 15, 2020 at 23:57

Ben A

10.8k
5
40
103

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
# If difference between one pair and next is 1, sequential. If not, reset rows to 1
    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0
```

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
# If difference between one pair and next is 1, sequential. If not, reset rows to 1
    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0
```

import itertools

def pairwise(splits):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(splits, 2)
    next(b, None)
    return list(zip(a, b))

def space_sort(df):
    groups = df.loc[(df_coord.table==False)].groupby('page')
    pages = {i:j[['top','top1','left','middle','left1']] for i,j in groups}
    cols = ['left','middle','left1']
    boxes = {}
    for page in pages:
        rows = {}
        c_df = pages[page]
        min_x = min(c_df.left)
        gaps = c_df.loc[df.left_diff>5]
        
        #  value count on left, middle and left1 values so we can deal with text justification.
        counts = {'left':[], 'middle':[], 'left1':[]}
        [counts[col].append(gaps[col].unique()) for col in cols if (gaps[col].value_counts()>2).any()]
        
        if len(counts['left'])>0:
            counts['left'][0] = np.insert(counts['left'][0], 0, int(min_x))

        #  search c_df for other points close to these x values.
        for col in cols:
            if len(counts[col])>0:
                for x in counts[col][0]:
                    row_spaces = {}
                    matches = c_df.loc[np.isclose(c_df[col],x, atol=5)]
                    left_groups = df_coord.loc[matches.index.values].reset_index()
                    
#           find points where line diff > 5 indicating new row. Get indexes.
                    vert_gaps = left_groups.loc[(left_groups.top - left_groups.top1.shift())>5]                    
                    vert_indexes = vert_gaps.index.values
                    vert_indexes = np.insert(vert_indexes,0,0)
                    vert_indexes = np.append(vert_indexes,len(left_groups))
                    
#           form groups between rows.
                    pairs = pairwise(vert_indexes)
                    for start,end in pairs:
                        box = left_groups.loc[start:end-1]
                        coords = (page, min(box.top),min(box.left),max(box.top1),max(box.left1))
                        boxes[coords]=(list(left_groups.loc[start:end-1,('index')]))

#  Find close boxes by seeing which align on the same x value (either top, centre or bottom)
    
    table = []
    for a, b in itertools.combinations(boxes, 2):

        a_pg, a_top, a_left, a_top1, a_left1 = a
        b_pg, b_top, b_left, b_top1, b_left1 = b
        a_centre = (a_top+a_top1)//2
        b_centre = (b_top+b_top1)//2
        if (np.isclose(a_top, b_top, atol=5)) | (np.isclose(a_centre, b_centre, atol=5)) | (np.isclose(a_top1, b_top1, atol=5)):
            table.append([boxes[a],boxes[b]])
    
#  Table list contains two lists of indexes of rows which are close together. 
#  As ordered, the indexes should be sequential.
# If difference between one pair and next is 1, sequential. If not, reset rows to 1
    t = (pairwise(table))
    row = 0
    for i in t:
        if (i[1][0][-1] - i[0][1][-1]) == 1:
            for r in i:
                row+=1
                num = 1
                for col in r:
                    print('indexes', col, 'row',row, 'col',num)
                    num+=1
        else:
            row = 0

Source Link

asked Jul 15, 2020 at 21:36

lawson

141
3

Loading

Stack Exchange Network

Return to Question

Determine table structure in imagepdf using whitespace between coordinates

Determine table structure in image using whitespace between coordinates

Determine table structure in pdf using whitespace between coordinates