
    ,h+                        d Z ddlZddlZddlZddlZddlmZ ddlZddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZ ddlmZmZmZmZ dd	lmZmZmZ dd
lmZ  ej:                           ej<                  d      Zd Z ddZ!d Z"ddZ#	 	 ddZ$ ed      Z% ed      Z&	 	 ddZ'	 	 ddZ(d Z)ddZ*e+dk(  r ejX                   e*              yy)z#Extract pdf structure in XML format    N)ArgumentParser)PDFDocumentPDFNoOutlines)PDFPage)	PDFParser)PDFObjectNotFoundPDFValueError)	PDFStream	PDFObjRefresolve1stream_value)	PSKeyword	PSLiteralLIT)isnumberz&[\000-\037&<>()"\042\047\134\177-\377]c                     t         j                  r&t        | t         j                        rt	        | d      } t
        j                  d |       S )Nzlatin-1c                 <    dt        | j                  d            z  S )Nz&#%d;r   )ordgroup)ms    V/var/www/html/Resume-Scraper/venv/lib/python3.12/site-packages/../../../bin/dumppdf.py<lambda>ze.<locals>.<lambda>   s    3qwwqz?!:     )sixPY3
isinstancebinary_typestrESC_PATsub)ss    r   er"      s4    
ww:a19;;:A>>r   c                    || j                  d       y t        |t              r| j                  dt        |      z         t	        j
                  |      D ]G  \  }}| j                  d|z         | j                  d       t        | |       | j                  d       I | j                  d       y t        |t              rS| j                  dt        |      z         |D ]  }t        | |       | j                  d       ! | j                  d	       y t        |t        j                  t        j                  f      r)| j                  d
t        |      t        |      fz         y t        |t              r|dk(  r | j                  |j                                y |dk(  r | j                  |j                                y | j                  d       t        | |j                         | j                  d       |dk(  r8|j                         }| j                  dt        |      t        |      fz         | j                  d       y t        |t              r| j                  d|j                   z         y t        |t"              r| j                  d|j$                  z         y t        |t&              r| j                  d|j$                  z         y t)        |      r| j                  d|z         y t+        |      )Nz<null />z<dict size="%d">
z<key>%s</key>
z<value>z	</value>
z</dict>z<list size="%d">

z</list>z<string size="%d">%s</string>rawbinaryz<stream>
<props>
z

</props>
textz<data size="%d">%s</data>
z	</stream>z<ref id="%d" />z<keyword>%s</keyword>z<literal>%s</literal>z<number>%s</number>)writer   dictlenr   	iteritemsdumpxmlliststring_typesr   r"   r
   get_rawdataget_dataattrsr   objidr   namer   r   	TypeError)outobjcodeckvdatas         r   r,   r,      sg   
{		*#t		&S12mmC( 	$FQII'!+,IIi COIIl#		$
 			)#t		&S12 	ACOIIdO	 			)#((#//:;		1SXqv4FFG#y!E>IIcoo'( 	 hIIclln% 	 II+,C#IIn%||~		73t9ag:NNOIIk"#y!		#cii/0#y!		)CHH45#y!		)CHH45}		'#-.
C.r   c                     |j                   D ]:  }| j                  d       t        | |j                         | j                  d       < y )Nz
<trailer>
z
</trailer>

)xrefsr(   r,   trailer)r5   docxrefs      r   dumptrailersr@   Z   sA    		 &		- T\\"		$%& r   c                    t               }| j                  d       |j                  D ]u  }|j                         D ]`  }||v r|j	                  |       	 |j                  |      }|.| j                  d|z         t        | ||       | j                  d       b w t        | |       | j                  d       y # t        $ r}t        d|z         Y d }~d }~ww xY w)Nz<pdf>z<object id="%d">
r7   z
</object>

znot found: %rz</pdf>)
setr(   r<   
get_objidsaddgetobjr,   r   printr@   )r5   r>   r7   visitedr?   r2   r6   r"   s           r   dumpallobjsrI   b   s    eGIIg		 +__& 
	+EKK+jj';		.67S.		+,
	++ cIIh
	 % +o)**+s   B>)3B>>	CCCc                    t        |d      }t        |      }	t        |	|      t        d t	        t        j                        d      D              }
fd}	 j                         }| j                  d       |D ]  \  }}}}}d }|r ||      }|
|d   j                     }nc|ra|}t        |t              rO|j                  d      }|r<t        |      dk(  r.|j                  d	      r ||d	         }|
|d   j                     }t        |      j                  d
d      }| j                  d|d|d       |.| j                  d       t        | |       | j                  d       || j                  d|z         | j                  d        | j                  d       |	j#                          |j#                          y # t         $ r Y ,w xY w)Nrbc              3   >   K   | ]  \  }}|j                   |f  y wN)pageid).0pagenopages      r   	<genexpr>zdumpoutline.<locals>.<genexpr>{   s!      =>FD$++v& =s      c                 0   t        | t              rt        j                  |             } n4t        | t              r$t        j                  | j
                              } t        | t              r| d   } t        | t              r| j                         } | S )ND)	r   r   r   get_destr   r3   r)   r   resolve)destr>   s    r   resolve_destz!dumpoutline.<locals>.resolve_dest~   sn    dC CLL./Di(CLL34DdD!9DdI&<<>Dr   z<outlines>
r   Sz/'GoTo'rU   zutf-8xmlcharrefreplacez<outline level="z	" title="z">
z<dest>z</dest>
z<pageno>%r</pageno>
z</outline>
z</outlines>
)openr   r   r)   	enumerater   create_pagesget_outlinesr(   r2   r   getreprr"   encoder,   r   close)outfpfnameobjidspagenospassworddumpallr7   
extractdirfpparserpagesrY   outlinesleveltitlerX   aserP   actionsubtyper!   r>   s                         @r   dumpoutlineru   v   s   	eT	Br]F
fh
'C =g2237;= =E	##%N#+3 	('UE4BF#D)tAw}}-fd+$jjoG4=K#?FJJE!+F3K8!&tAw}}!5%)<=AKKuaHIH%t$K(!3f<=KK'+	(, 	O$ LLNHHJ
	  s   EG 	GGFilespecEmbeddedFilec                 J   fd}t        |d      }	t        |	      }
t        |
|      j                  D ]Y  }|j	                         D ]D  }j                  |      }t        |t              s%|j                  d      t        u s= ||       F [ |	j                          y )Nc                 P   t         j                  j                  | d   xs | d         }| d   d   }j                  |j                        }t        |t              st        d|z        |j                  d      t        urt        d|z        t         j                  j                  |      }t         j                  j                  |      rt        d|z        t        d|z         t        |d	      }|j                  |j!                                |j#                          y )
NUFFEFz:unable to process PDF: reference for %r is not a PDFStreamTypez>unable to process PDF: reference for %r is not an EmbeddedFilezfile exists: %rzextracting: %rwb)ospathbasenamerF   r2   r   r
   r	   r`   LITERAL_EMBEDDEDFILEjoinexistsIOErrorrG   r\   r(   r0   rc   )r6   filenamefilereffileobjr   r5   r>   rj   s         r   extract1z!extractembedded.<locals>.extract1   s   77##CI$9S:d)C.**W]]+'9-L  ;;v&::P  ww||J177>>$+d233%&4		'""$%		r   rK   r}   )r\   r   r   r<   rD   rF   r   r)   r`   LITERAL_FILESPECrc   )rd   re   rf   rg   rh   ri   r7   rj   r   rk   rl   r?   r2   r6   r>   s          `      @r   extractembeddedr      s    * 
eT	Br]F
fh
'C		 __& 	E**U#C#t$<L)L	
 HHJ
r   c                    t        |d      }t        |      }	t        |	|      }
|r&|D ]!  }|
j                  |      }t	        | ||       # |rnt        t        j                  |
            D ]M  \  }}||v s|r+|j                  D ]  }t        |      }t	        | ||        8t	        | |j                         O |rt        | |
|       |s|s|st        | |
       |j                          |dvr| j                  d       y )NrK   rB   )r%   r&   r$   )r\   r   r   rF   r,   r]   r   r^   contentsr   r1   rI   r@   rc   r(   )rd   re   rf   rg   rh   ri   r7   rj   rk   rl   r>   r2   r6   rP   rQ   s                  r   dumppdfr      s    	eT	Br]F
fh
'C 	-E**U#CE3e,	- '(<(<S(AB 	/NVT #}} 9*3/s%89 E4::.	/ E3e,WwUC HHJ%%D
r   c                  "   t        t        d      } | j                  dt        d dd       | j                  ddd	d
d       | j	                         }|j                  ddd	d
d       |j                  ddt        d       | j                  dd      }|j                  dt        d dd       |j                  ddt        d       |j                  ddt        d       |j                  dd d	d
d!       |j                  d"d#t        d$d%&       | j                  d'd(      }|j                  d)d*t        d+d,&       |j	                         }|j                  d-d.d	d
d/       |j                  d0d1d	d
d2       |j                  d3d4d	d
d5       | S )6NT)descriptionadd_helpfiles+zOne or more paths to PDF files.)typedefaultnargshelpz--debugz-dF
store_truezUse debug logging level.)r   rs   r   z--extract-tocz-TzExtract structure of outlinez--extract-embeddedz-EzExtract embedded files)r   r   ParserzUsed during PDF parsing)r   z--page-numbersz0A space-seperated list of page numbers to parse.z	--pagenosz-pzA comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.z	--objectsz-iz1Comma separated list of object numbers to extractz--allz-az3If the structure of all objects should be extractedz
--passwordz-P z,The password to use for decrypting PDF file.)r   r   r   OutputzUsed during output generation.z	--outfilez-o-zJPath to file where output is written. Or "-" (default) to write to stdout.z--raw-streamz-rz%Write stream objects without encodingz--binary-streamz-bz)Write stream objects with binary encodingz--text-streamz-tz"Write stream objects as plain text)r   __doc__add_argumentr   add_mutually_exclusive_groupadd_argument_groupint)rl   procedure_parserparse_paramsoutput_paramscodec_parsers        r   create_parserr      s   $?F
c4s>  @ 4|'  ) ::<!!u\+ " - !!d% " ' ,,7 - 9LsD?  A T   
 T@  B u\B  D db;  = --> . @MTS   ! !==?LeL4  6 4|8  : u\1  3 Mr   c                    t               }|j                  |       }|j                  r1t        j                         j                  t        j                         |j                  dk(  rt        j                  }nt        |j                  d      }|j                  r2|j                  j                  d      D cg c]  }t        |       }}ng }|j                  r|j                  D ch c]  }|dz
  	 }}nK|j                  r5|j                  j                  d      D ch c]  }t        |      dz
   }}n
t!               }|j"                  }t$        j&                  rCt        j(                  j*                  r)|j-                  t        j(                  j*                        }|j.                  rd}n |j0                  rd}n|j2                  rd}nd }|j4                  r	d }	t6        }
n'|j8                  r|j8                  }	t:        }
nd }	t<        }
|j>                  D ]  } |
||||||j@                  ||		        |jC                          y c c}w c c}w c c}w )
N)argsr   w,rS   r%   r&   r'   )rh   ri   r7   rj   )"r   
parse_argsdebuglogging	getLoggersetLevelDEBUGoutfilesysstdoutr\   objectssplitr   page_numbersrg   rC   rh   r   PY2stdinencodingdecode
raw_streambinary_streamtext_streamextract_tocru   extract_embeddedr   r   r   allrc   )argvrl   r   rd   xrf   rg   rh   r7   rj   procre   s               r   mainr   %  s   _F$'Dzz$$W]]3||s

T\\3'||"&,,"4"4S"9:Q#a&::"&"3"34Q1q544	'+||'9'9#'>?!3q6A:??%}}H
ww399%%??399#5#56						
			**

 CUE67XXXUz	CC 
KKMK ;
 5?s   9I
*I"I__main__rM   )r   FNN)-r   r   os.pathr   rer   argparser   r   pdfminer.pdfdocumentr   r   pdfminer.pdfpager   pdfminer.pdfparserr   pdfminer.pdftypesr   r	   r
   r   r   r   pdfminer.psparserr   r   r   pdfminer.utilsr   basicConfigcompiler   r"   r,   r@   rI   ru   r   r   r   r   r   r   __name__exit r   r   <module>r      s    )   	 
 # 
 ; $ ( > J J 7 7 #    
"**>
??:z( 9;6:1h z? >*  =?:> F 572685p2j zCHHTV r   