
    k?1i[                     R   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZmZmZ dZdZdZdZeeeeef   f   Zed	eeef   f   Zerdd
lmZ eefdedededefdZ efdedededefdZ!dedededededefdZ"efdede#defdZ$efdede#defdZ%	 d.dedededefdZ&dede	e   fdZ'de	e   de	e	e      fd Z( G d! d"e)      Z* G d# d$e*      Z+ G d% d&e)      Z,g d'Z-g d(Z. G d) d*e/      Z0 e0d       Z1e G d+ d	             Z2 G d, d-e)      Z3y)/    N)	dataclass)
itemgetter)TYPE_CHECKINGAnyDictListOptionalSetTupleUnion   )utils)T_bboxT_numT_obj
T_obj_iter
T_obj_listT_point   TableSettings)Pageedgesx_tolerancey_tolerancereturnc                     g g d}| D ]  }||d      j                  |        t        j                  |d   d|      }t        j                  |d   d|      }||z   S )zs
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    vhorientationr   x0r   top)appendr   snap_objects)r   r   r   by_orientatione	snapped_v	snapped_hs          Z/home/www/therecruiter.miabetepe.com/venv/lib/python3.12/site-packages/pdfplumber/table.py
snap_edgesr*      st     352,>N 3q'(//23 "">##6kJI"">##6{KIy      r    	tolerancec                 B   |dk(  rd\  }}n|dk(  rd\  }}nt        d      t        t        | t        |                  }|d   g}|dd	 D ]P  }|d
   }||   ||   |z   k  r*||   ||   kD  s"t	        j
                  ||||         |d
<   @|j                  |       R |S )z
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    r   )r!   x1r   )r"   bottomzOrientation must be 'v' or 'h'keyr   r   N)
ValueErrorlistsortedr   r   resize_objectr#   )	r   r    r,   min_propmax_propsorted_edgesjoinedr&   lasts	            r)   join_edge_groupr<   '   s     c'(		,(9::u*X*>?@L1oF!" bzX;4>I56{T(^+"00x8Mr
 MM! Mr+   snap_x_tolerancesnap_y_tolerancejoin_x_tolerancejoin_y_tolerancec                    dt         dt        t        t        f   fd}|dkD  s|dkD  rt	        | ||      } t        | |      }t        j                  ||      }fd|D        }t        t        j                  |       } | S )z|
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    edger   c                 .    | d   dk(  rd| d   fS d| d   fS )Nr    r   r"   r   r!    )rB   s    r)   	get_groupzmerge_edges.<locals>.get_groupP   s-    #%e%%d$$r+   r   r0   c              3   X   K   | ]!  \  }}t        ||d    |d    dk(  rn       # yw)r   r   N)r<   ).0kitemsr?   r@   s      r)   	<genexpr>zmerge_edges.<locals>.<genexpr>[   s=       Au 	1Q4adck*?O	
s   '*)
r   r   strr   r*   r5   	itertoolsgroupbyr4   chain)	r   r=   r>   r?   r@   rE   _sortededge_groupsedge_gens	      ``    r)   merge_edgesrR   D   s    % %%U
"3 % !/!35"24DEU	*G##G;K $	H (+,ELr+   wordsword_thresholdc           
         t        j                  | t        d      d      }t        fd|      }t	        t        t         j                  |            }t        |      dk(  rg S t        t        t        d      |            }t        t        t        d      |            }g }|D ])  }||||d   |d   ||z
  dd|||d	   |d	   ||z
  ddgz  }+ |S )
zi
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    r"   r   c                      t        |       k\  S NlenxrT   s    r)   <lambda>z"words_to_edges_h.<locals>.<lambda>m       c!f&> r+   r   r!   r.   r   )r!   r.   r"   r/   widthr    r/   )
r   cluster_objectsr   filterr4   mapobjects_to_rectrY   minmax)	rS   rT   by_toplarge_clustersrectsmin_x0max_x1r   rs	    `       r)   words_to_edges_hrk   e   s     ""5*U*;Q?F>GNU**N;<E
5zQ	Z%u-.FZ%u-.FE 
 xE(&" {H+&"
 	

0 Lr+   c           
      r   t        j                  | t        d      d      }t        j                  | t        d      d      }dt        dt        fd}t        j                  | |d      }||z   |z   }t        |d       }t        fd	|      }t        t        t         j                  |            }	g }
|	D ]*  t        fd
|
D              }|r|
j                         , t        |
      dk(  rg S t        t         j                  |
      }t        t        |t        d                  }t        t        t        d      |            }t        t        t        d      |            }t        t        t        d      |            }|D cg c]  }|d   |d   ||||z
  dd c}||||||z
  ddgz   S c c}w )zy
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    r!   r   r.   wordr   c                 0    t        | d   | d   z         dz  S )Nr!   r.      )float)rm   s    r)   
get_centerz$words_to_edges_v.<locals>.get_center   s    T$Z$t*,-11r+   c                     t        |        S rW   rX   )r[   s    r)   r\   z"words_to_edges_v.<locals>.<lambda>   s    c!fW r+   r0   c                      t        |       k\  S rW   rX   rZ   s    r)   r\   z"words_to_edges_v.<locals>.<lambda>   r]   r+   c              3   J   K   | ]  }t        j                  |        y wrW   )r   get_bbox_overlap)rG   cbboxs     r)   rJ   z#words_to_edges_v.<locals>.<genexpr>   s     P!e,,T15Ps    #r   r"   r/   r   r!   r.   r"   r/   heightr    )r   r_   r   r   r   r5   r`   r4   ra   objects_to_bboxanyr#   rY   bbox_to_rectrd   rc   )rS   rT   by_x0by_x1rq   	by_centerclusterssorted_clustersrf   bboxescondensed_bboxesoverlapcondensed_rectssorted_rectsri   min_top
max_bottombrw   s    `                @r)   words_to_edges_vr      s    !!%D)91=E!!%D)91=E2 25 2 %%eZ;Iu}y(H X+<=O>PN #e++^<=F &( *P?OPP##D)*
 !	%,,.>?OJt4DEFLZ%|45F#j'67GSH-|<=J 
  D'D'  7*	

   7*	
		  
s   
F4c           	         i }dD cg c]  t        t        fd|              c}\  }}t        |t        dd            D ]  }t        |t        dd            D ]  }|d   |d   |z   k  s|d   |d   |z
  k\  s!|d   |d   |z
  k\  s0|d   |d   |z   k  s?|d   |d   f}	|	|vrg g d||	<   ||	   d   j	                  |       ||	   d	   j	                  |         |S c c}w )
zi
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    r   c                     | d   k(  S )Nr    rD   )r[   os    r)   r\   z(edges_to_intersections.<locals>.<lambda>   s    a.!3 r+   r!   r"   r0   r/   r.   r   r   )r4   r`   r5   r   r#   )
r   r   r   intersectionsr   v_edgesh_edgesr   r   vertexs
       `     r)   edges_to_intersectionsr      s'    &(MFPABV3U;<GW GD%!89 5Zt%<= 	5A5ah45x[QuX%;<tW4;!67tW4;!67D'1U8,.242,>M&)f%c*11!4f%c*11!4	55 !s   C(r   c                 L    dt         dt         dt        f fdt        t         j	                                     t              dt        t            dt        dt        t           f fdfdt        t                    D        }t        t        d	|            S )
a8  
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    p1p2r   c                 4   dt         dt        t           fd}| d   |d   k(  r5 ||    d         j                   ||   d               }t	        |      ry| d   |d   k(  r5 ||    d         j                   ||   d               }t	        |      ryy	)
Nr   r   c                 H    t        t        t        j                  |             S rW   )setra   r   obj_to_bbox)r   s    r)   edges_to_setzCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_set   s    s5,,e455r+   r   r   Tr   r   F)r   r
   r   intersectionrY   )r   r   r   commonr   s       r)   edge_connectsz-intersections_to_cells.<locals>.edge_connects   s    	6
 	6s6{ 	6 a5BqE>!-"3C"89FF]2.s34F 6{a5BqE>!-"3C"89FF]2.s34F 6{r+   pointsic                 f   |dz
  k(  ry | |   }| |dz   d  }|D cg c]  }|d   |d   k(  s| }}|D cg c]  }|d   |d   k(  s| }}|D ]U  } 
||      s|D ]D  } 
||      s|d   |d   f}	|	v s 
|	|      s& 
|	|      s0|d   |d   |	d   |	d   fc c S  W y c c}w c c}w )Nr   r   rD   )r   r   ptrestr[   belowrightbelow_ptright_ptbottom_rightr   r   n_pointss             r)   find_smallest_cellz2intersections_to_cells.<locals>.find_smallest_cell
  s   1AYa!eg 2qAaDBqEM22 2qAaDBqEM22 	LH X.! L$R2 (Xa[9 "]2%lH=%lH= qE2a5,q/<?KKL		L" ' 32s   B)B)B.B.c              3   0   K   | ]  } |        y wrW   rD   )rG   r   r   r   s     r)   rJ   z)intersections_to_cells.<locals>.<genexpr>%  s     J!"61-Js   N)r   boolr4   r5   keysrY   r   intr	   r   ranger`   )r   cell_genr   r   r   r   s   ` @@@@r)   intersections_to_cellsr      s    ' w 4 & &++-./F6{H4= S Xf=M 6 KuS[7IJHtX&''r+   cellsc                 N   dt         dt        t        t        t        t        f   fd}t        |       }t	               g }g }t        |      rt        |      }t        |      D ]  } ||      }t        |      dk(  r1t	        |      z  |j                  |       |j                  |       Jt        fd|D              }|dkD  sdt	        |      z  |j                  |       |j                  |        t        |      |k(  r:|j                  t        |             j                          |j                          t        |      rt        |      r|j                  t        |             t        |d       }	|	D 
cg c]  }
t        |
      dkD  s|
 }}
|S c c}
w )	z
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).
    rw   r   c                 ,    | \  }}}}||f||f||f||ffS rW   rD   )rw   r!   r"   r.   r/   s        r)   bbox_to_cornersz(cells_to_tables.<locals>.bbox_to_corners/  s/    "CVS	B<"cRLAAr+   r   c              3   &   K   | ]  }|v  
 y wrW   rD   )rG   rv   current_cornerss     r)   rJ   z"cells_to_tables.<locals>.<genexpr>H  s     "NA1#7"Ns   c                 &    t        d | D              S )Nc              3   0   K   | ]  }|d    |d   f  yw)r   r   NrD   )rG   rv   s     r)   rJ   z4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>a  s     .G!ad|.Gs   )rc   )ts    r)   r\   z!cells_to_tables.<locals>.<lambda>a  s    3.GQ.G+G r+   r0   r   )r   r   r   r4   r   rY   r#   removesumclearr5   )r   r   remaining_cellscurrent_cellstablesinitial_cell_countcellcell_cornerscorner_countrO   r   filteredr   s               @r)   cells_to_tablesr   )  s   Bf Bw'/Q)R B 5kO
 %(EO"$MF
o
 /) 	1D*40L=!Q&3|#44$$T*&&t,  #"N"NN  !##s<'88O!((.#**40#	1( }!33MM$}-.!!#!5 o
> =d=)* V!GHG"1ac!fqj1H1O 2s   F"F"c                   $    e Zd Zdeee      fdZy)	CellGroupr   c                 X   || _         t        t        t        d      t	        d |                  t        t        t        d      t	        d |                  t        t        t        d      t	        d |                  t        t        t        d      t	        d |                  f| _        y Nr   r   ro   r   )r   rc   ra   r   r`   rd   rw   )selfr   s     r)   __init__zCellGroup.__init__g  sz    
JqM6$#678JqM6$#678JqM6$#678JqM6$#678	
	r+   N)__name__
__module____qualname__r   r	   r   r   rD   r+   r)   r   r   f  s    
d8F#34 
r+   r   c                       e Zd Zy)RowNr   r   r   rD   r+   r)   r   r   q      r+   r   c                   v    e Zd Zdddee   fdZedefd       Zedee   fd       Z	de
deeee         fd	Zy
)Tablepager   r   c                      || _         || _        y rW   )r   r   )r   r   r   s      r)   r   zTable.__init__v  s    	
r+   r   c           
         | j                   }t        t        t        d      |            t        t        t        d      |            t	        t        t        d      |            t	        t        t        d      |            fS r   )r   rc   ra   r   rd   )r   rv   s     r)   rw   z
Table.bboxz  sa    JJJqM1%&JqM1%&JqM1%&JqM1%&	
 	
r+   c                    t        | j                  t        dd            }t        t        t	        t        t        d      | j                                          }g }t        j                  |t        d            D ]Q  \  }}|D ci c]  }|d   |
 }}t        |D cg c]  }|j                  |       c}      }	|j                  |	       S |S c c}w c c}w )Nr   r   r0   )r5   r   r   r4   r   ra   rL   rM   r   getr#   )
r   rO   xsrowsy	row_cellsr   xdictr[   rows
             r)   r   z
Table.rows  s    Aq)9:&SA

;<=>%--gz!}E 	LAy/89tT!Wd]9E9R0uyy|01CKK	  :0s   CC
kwargsc           	      ,   | j                   j                  }g }dt        dt        dt        fd}| j
                  D ]  }g }|D cg c]  } |||j                        s| }}|j                  D ]  }	|	d }
nm|D cg c]  } |||	      s| }}t        |      rG|	d   |d<   |	d   |d<   d	|v r|	d
   |	d   z
  |d<   |	d   |	d   z
  |d<   t        j                  |fi |}
nd}
|j                  |
        |j                  |        |S c c}w c c}w )Ncharrw   r   c                     | d   | d   z   dz  }| d   | d   z   dz  }|\  }}}}t        ||k\  xr ||k  xr ||k\  xr ||k        S )Nr"   r/   ro   r!   r.   )r   )r   rw   v_midh_midr!   r"   r.   r/   s           r)   char_in_bboxz#Table.extract.<locals>.char_in_bbox  sm    %[4>1Q6E$Z$t*,1E"&BR"V52:VESLVuv~ r+   r   x_shiftr   y_shiftlayoutro   layout_widthr   layout_height )r   charsr   r   r   r   rw   r   rY   r   extract_textr#   )r   r   r   	table_arrr   r   arrr   	row_charsr   	cell_text
cell_charss               r)   extractzTable.extract  sN   				u 	F 	t 	 99 	"CC*/P$<chh3OPIP		 &< $I *3"!%l46N"J " :,0Gy),0Gy)#v-59!WtAw5FF>26:1gQ6GF?3$)$6$6z$LV$L	$&	

9%#&$ S!-	"0 - Q"s   DD;D
DN)r   r   r   r   r   r   propertyrw   r   r   r   r	   rK   r   rD   r+   r)   r   r   u  st    V DL  
f 
 
 d3i  % %T(3--@(A %r+   r   )lineslines_stricttextexplicit)snap_tolerancer=   r>   join_tolerancer?   r@   edge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerancec                       e Zd Zy)
UnsetFloatNr   rD   r+   r)   r  r    r   r+   r  c                   `   e Zd ZU dZeed<   dZeed<   dZee	e
eef         ed<   dZee	e
eef         ed<   eZeed<   eZeed<   eZeed	<   eZeed
<   eZeed<   eZeed<   dZeed<   eZeed<   eZeed<   dZeed<   eZeed<   eZeed<   dZ ee!ee"f      ed<   ddZ#e$dee%   dd fd       Z&y)r   r   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r=   r>   r   r?   r@   r   r   r   r   r   r   r  text_settingsr   c           	      &   t         D ]$  }t        | |      xs ddk  st        d| d       dD ]=  }t        | |dz         }|t        vst        | ddj	                  t               d       | j
                  i | _        d	D ]:  }|| j
                  vs| j
                  j                  d
d      | j
                  |<   < d
| j
                  v r| j
                  d
= dD ]/  \  }}t        | |      t        u st        | |t        | |             1 | S )a  Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.
        r   zTable setting 'z' cannot be negative)
horizontalvertical	_strategyz_strategy must be one of{,})r   r   r,   r   ))r=   r   )r>   r   )r?   r   )r@   r   )r   r   )r  r   )	NON_NEGATIVE_SETTINGSgetattrr3   TABLE_STRATEGIESjoinr	  r   UNSETsetattr)r   settingr    strategyattrfallbacks         r)   __post_init__zTableSettings.__post_init__  sO     - 	RGg&+!q0 ?7);O!PQQ	R 6 	Kt[;%>?H// "m $"234B8 	 %!#D 3 	RD4---+/+=+=+A+A+q+Q""4(	R $,,,"";/
 		=ND( tT"e+dGD($;<		= r+   settingsc                     | |        S t        ||       r|S t        |t              r?i }i }|j                         D ]  \  }}|d d dk(  r	|||dd  <   |||<    ||d<    | di |S t        d|       )N   text_r	  zCannot resolve settings: rD   )
isinstancedictrI   r3   )clsr  core_settingsr	  rH   r   s         r)   resolvezTableSettings.resolve  s    5L#&O$'MM ( )1Ra5G#+,M!AB%('(M!$	)
 .;M/*'''8
CDDr+   )r   r   )'r   r   r   r  rK   __annotations__r  r  r	   r   r   r   r   r  DEFAULT_SNAP_TOLERANCEr   r  r=   r>   DEFAULT_JOIN_TOLERANCEr   r?   r@   r   DEFAULT_MIN_WORDS_VERTICALr   r   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r  r	  r   r   r  classmethodT_table_settingsr#  rD   r+   r)   r   r     s   $s$&&CGXd5+>&?@GEIxU5%<-@(ABI2NE2#e##e#2NE2#e##e#OU88 <#<$%E%&+e+&+e+.2M8DcN+23j Ex(89 Eo E Er+   c                   4    e Zd ZdZd	dddee   fdZdefdZy)
TableFindera0  
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    Nr   r   r  c                    || _         t        j                  |      | _        | j	                         | _        t        | j
                  | j                  j                  | j                  j                        | _	        t        | j                        | _        t        | j                        D cg c]  }t        | j                   |       c}| _        y c c}w rW   )r   r   r#  r  	get_edgesr   r   r   r  r   r   r   r   r   r   )r   r   r  
cell_groups       r)   r   zTableFinder.__init__7  s    	%--h7^^%
3JJMM22MM22

 ,D,>,>?
;J4::;V
-7E$))Z(
 
s   ,Cr   c           
         | j                   }dD ]I  }t        ||dz         }|dk(  st        |d|z   dz         }t        |      dk  s9t        d| d| d	       |j                  }|j
                  }|d
k(  s|d
k(  r* | j                  j                  di |j                  xs i }g }|j                  xs g D ]  }	t        |	t              r5t        j                  |	      D ]  }
|
d   dk(  s|j                  |
        H|j                  |	|	| j                  j                  d   | j                  j                  d   | j                  j                  d   | j                  j                  d   z
  dd        |dk(  r+t        j                   | j                  j"                  d      }nV|dk(  r-t        j                   | j                  j"                  dd      }n$|d
k(  rt%        |j&                        }n|dk(  rg }|z   }g }|j(                  xs g D ]  }	t        |	t              r5t        j                  |	      D ]  }
|
d   dk(  s|j                  |
        H|j                  | j                  j                  d   | j                  j                  d   | j                  j                  d   | j                  j                  d   z
  |	|	dd        |dk(  r+t        j                   | j                  j"                  d      }nV|dk(  r-t        j                   | j                  j"                  dd      }n$|d
k(  rt+        |j,                        }n|dk(  rg }|z   }t/        |      t/        |      z   }t1        ||j2                  |j4                  |j6                  |j8                        }t        j                   ||j:                        S )N)r  r  r  r   	explicit__linesro   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r   r    r   r   r   rx   r   r   line)	edge_type)rT   r   r   )r!   r.   r^   r"   r/   r    )r=   r>   r?   r@   )
min_lengthrD   )r  r  rY   r3   r  r  r   extract_wordsr	  r  r  r   r   obj_to_edgesr#   rw   filter_edgesr   r   r   r  rk   r   r4   rR   r=   r>   r?   r@   r   )r   r  r    r  r   v_strath_stratrS   
v_explicitdescr&   v_baser   
h_explicith_baser   r   s                    r)   r.  zTableFinder.get_edgesE  s   ==5 
	Kx{)BCH:%+*Ch*NOu:>$k] +$$/= 1'( 
	 ,,..f6 1+DII++Mx/E/E/KME
44: 	D$%++D1 -A'3."))!,- !!""#yy~~a0"&)).."3"&)).."3diinnQ6G"G'*		" g''		=F&''		OF%eH<W<WXF
"FZ
66<" 	D$%++D1 -A'3."))!,- !!"iinnQ/"iinnQ/!%!2TYY^^A5F!F#"&'*		" g''		=F&''		OF%h&C&CF 
"FZQ$q'!%66%66%66%66
 !!%H4L4LMMr+   rW   )	r   r   r   __doc__r	   r*  r   r   r.  rD   r+   r)   r,  r,  ,  s0    
V 
x8H/I 
[N: [Nr+   r,  )r   r   )4rL   dataclassesr   operatorr   typingr   r   r   r   r	   r
   r   r   r   r   _typingr   r   r   r   r   r   r%  r&  r'  r(  rK   T_intersectionsr*  r   r   r*   r<   rR   r   rk   r   r   r   r   objectr   r   r   r  r  rp   r  r  r   r,  rD   r+   r)   <module>rG     s2    !  N N N  J J     wS*_ 556$sCx.89 
 0/!!! ! 	!& =S$'49:  	
  D .J(('*((X .H<<'*<<@ EF$)<A6<(/ <(d6l <(~:4< :Df,> :z
 
	) 	?F ?D A   	 	 	1 YE YE YExtN& tNr+   