#] #] ********************* #] "$d_SysMaint"'Linux/pdftotext notes.txt' www.BillHowell.ca 20Jun2018 initial To view this file - use a text editor (not word processor) constant width font (eg courier 10), tab - 3 spaces 48************************************************48 24************************24 # Table of Contents, see ToC file - not listed [here, screen] as it's too large! : # $ grep "^#]" "$d_SysMaint"'Linux/pdftotext notes.txt' | sed 's/^#\]/ /' ********************* "$d_SysMaint"'Linux/pdftotext notes.txt' 23Jul2022 Free online Hormonageddon revision1 - diff of changes 16Jun2021 my [cool, readable] setup of sed expressions!!! pdf_convertToTxt_path IS - res ipsa loquitor 20Jun2018 for de Jager's solar papers 24************************24 08********08 #] ??Jan2024 08********08 #] ??Jan2024 08********08 #] 30Dec2023 conversion of Davidson pdfs to text d_work="$d_PROJECTS"'9_My sports & clubs/Suspicious/' f1='Davidson 2023 Earth Disaster Cycle' pdftotext -layout "$d_work$f1.pdf" "$d_work$f1.txt" >> garbled results!! need updated or different software >> doesn't produce useful output, Ben Davidson has protected his pdfs well!! p82 Major White and Project Nanook- when the government knew p83 Coverup of catastrophe p84 The four horsemen of the apocalypse p85 The great waves of the catastrophe 08********08 #] 23Jul2022 Free online Hormonageddon revision1 - diff of changes coded in "$d_bin""diff pdftotext.sh" - run diff on pdf files : d_work="$d_webRawe"'References/Health/' f1='Sacha Dobler 07Sep2021 Hormonageddon - How Chemical and Electromagnetic Influences weakened the Human Character in three Generations' f2='Dobler 19Jun2022 revision1 Hormonageddon' pdftotext "$d_work$f1"'.pdf' "$d_temp$f1"'.txt' .. plus more code... 08********08 #] 16Jun2021 my [cool, readable] setup of sed expressions!!! 
# loaddefs link d_Qndfs 'email analysis - Fauci corona virus.ndf'

IF flag_debug THEN write 'loading pdf_convertToTxt_path' ; ENDIF ;

#] pdf_convertToTxt_path IS - res ipsa loquitor
# 12Jun2021? initial, cool sed expression setup!
# Converts the pdf at p_emailsPdf to text in a temp file under d_temp,
# then runs sed with sedExprFormat1 on that temp file, writing to p_pdftotext.
# Inputs are the NONLOCAL variables p_emailsPdf, p_pdftotext, sedExprFormat1.
# If path_exists rejects either path, an error is written and nothing is run.
# The two shell commands are chained with && (not a pipe) : pdftotext writes
# to the temp file rather than stdout, so sed must not start until it is done.

pdf_convertToTxt_path IS
{ LOCAL p_temp ;
  NONLOCAL p_emailsPdf p_pdftotext sedExprFormat1 ;
  % ;
  p_temp := link d_temp 'pdf_convertToTxt_path temp.txt' ;
  % verify that paths are allowable ;
  IF (NOT AND (EACH path_exists ("p_old p_emailsPdf) ("p_new p_temp))) THEN
    EACH write '?pdf_convertToTxt_path error, file unknown, one of : ' p_emailsPdf p_temp '' ;
  ELSE
    % sed runs only after pdftotext has finished successfully ;
    host link 'pdftotext "' p_emailsPdf '" "' p_temp '" && sed "' sedExprFormat1 '" "' p_temp '" >"' p_pdftotext '" ' ;
  ENDIF ;
}

# easy [view, edit] of sed expressions :
# pdftotext frequent mis-recognitions : D=[D0], I= [1IJlTf], O=[O0]
# Each table below is a flat list of (label, sed expression) pairs.
# Only the expressions (second column) are linked into the final sed script,
# so every expression after the first in a table starts with ';'.
# The formfeed expression uses the GNU sed escape \f (GNU sed is already
# required by the I flag used below).
# NOTE(review): the privacy pattern <\(.*\)@ is greedy - on a line with
# several <name@domain> addresses it looks like it would keep only the last
# domain - confirm whether that is intended.

% First general formatting ;
sedFormat1 :=
   'annoying formfeed'                     's/\f//'
   'Privacy - remove email "contactName"'  ';s/<\(.*\)@/<@/'
   'DateFromToCCSubject reformat'
      (link
         ';s/D[ ]*a[ ]*t[ ]*e[ ]*:/Date:/I'
         ';s/S[ ]*e[ ]*n[ ]*t[ ]*:/Date:/I'
         ';s/F[ ]*r[ ]*o[ ]*m[ ]*:/From:/I'
         ';s/S[ ]*u[ ]*b[ ]*j[ ]*e[ ]*c[ ]*t[ ]*:/Subject:/I'
         ';s/T[ ]*o[ ]*:/To:/I'
      ) ;

% organization acronyms ;
sedExprOrgAcronym :=
   '(NIH/CC/DLM)'         ';s/(N[1IJlTf]H\/CC\/[D0]LM)/(NIH\/CC\/DLM)/'
   '(NIH/FIC)'            ';s/(N[1IJlTf]H\/F[1IJlTf]C)/(NIH\/FIC)/'
   '(NIH/NCI)'            ';s/(N[1IJlTf]H\/NC[1IJlTf])/(NIH\/NCI)/'
   '(NIH/OD)'             ';s/(N[1IJlTf]H\/[O0][D0])/(NIH\/OD)/'
   '(NIH/VRC)'            ';s/(N[1IJlTf]H\/VRC)/(NIH\/VRC)/'
   '(CDC/DDID/NCIRD/OD)'  ';s/(C[D0]C\/[D0][D0][I1][D0]\/NCIRD\/[O0][D0])/(CDC\/DDID\/NCIRD\/OD)/'
   '(CDC/OD)'             ';s/(C[D0]C\/[O0][D0])/(CDC\/OD)/'
   '(OS/IOS)'             ';s/([O0]S\/[1IJlTf][O0]S)/(OS\/IOS)/'
   '(OS/ASPR/IO)'         ';s/([O0]S\/ASPR\/[1IJlTf][O0])/(OS\/ASPR\/IO)/' ;

sedExprAll := link sedFormat1 sedExprOrgAcronym ;
n_cols := 2 ;
n_rows := floor ((gage shape sedExprAll) / 2) ;
tbl_sedFormat1 := n_rows n_cols reshape sedExprAll ;
sedExprFormat1 := link second cols tbl_sedFormat1 ;

# The row count below is taken from sedExprToFromCC itself.
# Taking it from sedExprAll (a longer list) would make reshape recycle the
# first pairs and append an expression with no leading ';'.
# NOTE(review): 's/Dr. //' has an unescaped '.', so it matches any character
# after 'Dr' - confirm whether 'Dr\. ' was meant.

% [quick, general] reformat ;
sedExprToFromCC :=
   'extract lines of interest'    's/^From://I;s/^To://I;s/^CC://I'
   'placeholder for newline'      ';s/;/\\sed_n/g'
   'get rid of multiple spaces'   ';s/[ ]\+/ /g'
   'get rid of spaces within ()'  ';s/(\(.*\) \(.*\))/(\1\2)/g'
   'problematic lineStart1'       ';s/^ //g'
   'problematic lineStart2'       ';s/^\.\+//g'
   'firstname tighten'            ';s/ \, /\, /g'
   'lastname tighten'             ';s/, ((/\, (/g'
   'delete title for alphaSort'   ';s/Dr. //' ;

n_cols := 2 ;
n_rows := floor ((gage shape sedExprToFromCC) / 2) ;
tbl_sedToFromCC := n_rows n_cols reshape sedExprToFromCC ;
sedExprToFromCC := link second cols tbl_sedToFromCC ;


08********08
#] 20Jun2018 for de Jager's solar papers

$ pdftotext "/media/bill/HOWELL_BASE/Climate/References/de Jager 2008 - Solar activity and its influence on climate.pdf" \
   "/media/bill/ramdisk/de Jager 2008 - Solar activity and its influence on climate.txt"

# enddoc