"$d_Qndfs""dictionaries/dictionary notes.txt" www.BillHowell.ca 28Jun2021 initial see also +-----+ ToDos 16Jun2021 -disrupted words! - simple stats on associations, like Robert Hecht-Neilson, but damaged words but : higher-dimesions of context, calculus of words, fractional order calculus of words 27Jun2021 pTxt_pDic_extract_pfrag_pFragSubs I need a list of common [begin,end]ings of words [adjective, noun], [adverb, verb], [conjunctive, prepositions] Clean dictionaries --+-----------------------+--------------------------- | | | | | ----------------->- recombine | | | | | | V | | | | diff---> frags --->+------>+ breakup | | ^ | | | | | | | Dirty word list -----| V V | fixBlends fixSplits | V Manual collection of [good, new] words ----------->+-----------------> diff--> vaccine wordList ^ | | [url, document] sources of vaccine words ----------- diff : - perhaps cut off <2 chrs? - can always search smallFrags later 27Jun2021 pTxt_pDic_extract_pfrag_pFragSubs -> pTxt_make_pWrd I need to program smarter... - go back to pDwd_pDic_extract_pDif, tackle more efficiently! - Get rid of underscores? (not enough to worry about...) - drop frags <= 2 character - can check longer side of splits - don't merge dictionary & frags? - first, try dropping <2 chr frags, and underlines - more efficient handling of [start, end, plural, sex, etc] 29Jun2021 (later) to reduce [data, computation]s, remove pdic elements to length >1 ? ; 24************************24 08******08 01Jul2021 case of chrs ptxt has to ALL be in XOR[lower (normal), upper (acronym), InitialChr (name, start sentence)] case - or there are too many combos Best to have frags both in [original state, lower] case 08******08 01Jul2021 markup all frags in ptxt with "[||]" host link 'sed "s/' ^\((|\)frag^\(|)\) '/ (|' frag '|) /g" "' ptm1 '" >"' ptm2 '"' ; Example output while debugging : qnial> pEmails_doALL +-------------+ 210701 21h07m47s-> pTxt_pDic_fixFrags : 210701 21h07m47s-> generate a sorted list of words from the text 210701 21h07m49s-> diff pdic and pwrd, sort to pfrg +----------------------+-----+ |(gage shape fragL) = |23085| +----------------------+-----+ 210701 21h07m53s-> markup all frags in ptxt with "[||]" 210701 21h22m23s-> sed frag sequences of (1-5) frags lengths 210701 21h22m23s-> grep frag sequences of (1-5) frags lengths grep: /media/bill/ramdisk/grep5.txt: No such file or directory grep: /media/bill/ramdisk/grep4.txt: No such file or directory grep: /media/bill/ramdisk/grep3.txt: No such file or directory grep: /media/bill/ramdisk/grep2.txt: No such file or directory +-------------+ 210701 21h22m23s-> pTxt_removeAdd_spaces : 08******08 01Jul2021 "invert" raweSupFragL to findSubFragL pTxt_pDic_extract_pfrag_pFragSubs change : +.....+ write link timestamp_YYMMDD_HMS '-> "invert" raweSupFragL to findSubFragL' ; findFragL := raweFragL ; findSubFragL := n_raweFragL reshape [null] ; i := 0 ; WHILE (n_raweFragL > i) DO FOR supFrag WITH raweSupFragL@i DO IF (NOT isfault (j := find_Howell supFrag findFragL)) THEN findSubFragL@j := findSubFragL@j link supFrag ; ENDIF ; ENDFOR ; i := i + 1 ; ENDWHILE ; n_findFragL findFragL findSubFragL := fragL_subFragL_getNonNull 'find' findFragL findSubFragL ; % I want file output for "find" as well - bug hunts ; fout := open pfnd "w ; FOR i WITH (tell (gage shape findFragL)) DO fout writefile (link '<<:| ' (first findFragL@i)) ; fout EACHRIGHT writefile findSubFragL@i ; ENDFOR ; close fout ; +.....+ To : +.....+ write link timestamp_YYMMDD_HMS '-> "invert" raweSupFragL to findSubFragL' ; findFragL := 
raweFragL ; findSubFragL := n_raweFragL reshape [null] ; i := 0 ; WHILE (n_raweFragL > i) DO FOR supFrag WITH raweSupFragL@i DO IF (NOT isfault (j := find_Howell supFrag findFragL)) THEN findSubFragL@j := findSubFragL@j link supFrag ; ENDIF ; ENDFOR ; i := i + 1 ; ENDWHILE ; n_findFragL findFragL findSubFragL := fragL_subFragL_getNonNull 'find' findFragL findSubFragL ; % I want file output for "find" as well - bug hunts ; fout := open pfnd "w ; FOR i WITH (tell (gage shape findFragL)) DO fout writefile (link '<<:| ' (first findFragL@i)) ; fout EACHRIGHT writefile findSubFragL@i ; ENDFOR ; close fout ; +.....+ +-----+ qnial> loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf' qnial> pEmails_doALL qnial> loaddefs link d_Qndfs 'dictionaries.ndf' qnial> pTxt_make_pWrd_test optr change : +.....+ +.....+ To : +.....+ +.....+ +-----+ olde code # pDicInn_removeApoLines_pDicOut p_dicUSA_Linux p_dicUSA_noApos # $ grep --invert-match "*\'*" "/usr/share/dict/american-english" >"$d_Qndfs""dictionary Linux american-english noApos.txt" # $ grep --invert-match "\o047" "/media/bill/Dell2/Website - raw/Pandemics, health, and the Sun/corona virus/Fauci covid emails/test dic.txt" # $ cat "/media/bill/Dell2/Website - raw/Pandemics, health, and the Sun/corona virus/Fauci covid emails/test dic.txt" $ grep --invert-match "a" "/media/bill/Dell2/Website - raw/Pandemics, health, and the Sun/corona virus/Fauci covid emails/test dic.txt" $ grep --invert-match "'" "/media/bill/Dell2/Website - raw/Pandemics, health, and the Sun/corona virus/Fauci covid emails/test dic.txt" # 28Jun2021 - maybe not so useful, simple ones just copy-paste with manual cleanup, big ones dic-specific optrs # cleanup can be tedious, advantage to special optrs each webPage # loaddefs link d_Qndfs 'dictionaries.ndf' IF flag_debug THEN write 'loading urlL_make_pDic' ; ENDIF ; #] urlL_make_pDic IS OP urlL pdic - create a wordL from a list of urls urlL_make_pDic IS OP urlL pdic { LOCAL strQuotedListOfDicL ; IF (NAND (AND (EACHRIGHT path_exists "p_old urlL)) (path_exists "p_new pdic) ) THEN write '?urlL_make_pDic error, file unknown, one of : ' EACH write urlL ; EACH write pdic '' ; ELSE ???? strQuotedListOfDicL := (' "' EACHRIGHT link urlL) EACHLEFT link '" ' ; host link 'cat ' strQuotedListOfDicL ' | sort -u >>"' pdic '" ' ; ENDIF ; } 08******08 29Jun2021 pDicAll use in pTxt_pDic_extract_pfrag_pFragSubs Take this out! : write link timestamp_YYMMDD_HMS '-> merge pdic and pwrd, sort to pdwd : ' ; pDicL_merge_pDic (pwrd pdic) pdwd ; strL := strList_readFrom_path pdwd ; Put back in! : write link timestamp_YYMMDD_HMS '-> diff pdic and pwrd, sort to pdiff : ' ; % find lines of SORTED pwrd not in pdic, 29Jun2021 cd "[_]\+" to "_", might be a problem ; % to reduce [data, computation]s, remove frags of length 1 ; host link 'diff "' pdic '" "' pwrd '" | grep "^>" | sed "s/^> //" | grep --invert-match "_" | grep --invert-match "[0-9]\+" | grep --invert-match "[A-Za-z]\?" | sort -u -f >"' pdif '"' ; strL := strList_readFrom_path pdif ; pTxt_pDic_extract_pfrag_pFragSubs change : +.....+ host link 'diff "' pdic '" "' pwrd '" | grep "^>" | sed "s/^> //" | grep --invert-match "_" | grep --invert-match "[0-9]\+" | grep --invert-match "[A-Za-z]\?" 
| sort -u -f >"' pdif '"' ;
+.....+
To :
+.....+
host link 'diff "' pdic '" "' pwrd '" | grep "^>" | sed "s/^> //" | grep --invert-match "_" | grep --invert-match "[0-9]\+" | sort -u -f >"' pdif '"' ;
+.....+
Change again to :
host link 'diff "' pdic '" "' pwrd '" | grep "^>" | sed "s/^> //" | grep --invert-match "_" | grep --invert-match "[0-9]\+" | grep --invert-match "$[A-Za-z]$" | sort -u -f >"' pdif '"' ;
+-----+
It might be handy to have counts for frags sorted by length - at what cutoff is there a major benefit computationally?
word lengths (dictionary list) are the key, the frag lengths?

pTxt_pDic_extract_pfrag_pFragSubs change :
+.....+
write link timestamp_YYMMDD_HMS '-> diff pdic and pwrd, sort to pdiff : ' ;
% find lines of SORTED pwrd not in pdic, 29Jun2021 cd "[_]\+" to "_", might be a problem ;
% 29Jun2021 (later) to reduce [data, computation]s, remove pdic elements to length >1 ? ;
host link 'diff "' pdic '" "' pwrd '" | grep "^>" | sed "s/^> //" | grep --invert-match "_" | grep --invert-match "[0-9]\+" | grep --invert-match "$[A-Za-z]$" | sort -u -f >"' pdif '"' ;
strL := strList_readFrom_path pdif ;
+.....+
To :
+.....+
+.....+
Leave it for now, go back to getting pTxt_pDic_extract_pfrag_pFragSubs to work...

08********08
29Jun2021 search "Tom Cobb Montreal linguistics"
https://www.lextutor.ca/cv/
https://lextutor.ca/
https://www.lextutor.ca/cv/index.html#Tutorial
Yoshua Bengio
+-----+
https://www.youtube.com/watch?v=fpdq14jeHQs&feature=youtu.be
lextutor@dade
6,042 views Oct 5, 2013 Tom Cobb 63 subscribers
A preview of Tom's Lextutor workshops at Dade College Miami, 2013 Oct 9-12
hypertext builder - computer speech so learners read & listen
MultiConc (concordance) output - which word fits multiple sentences best?
Group Lex - learned words by a group, top 10 questions etc
Vocabulary profile - test learners' level and select training material

08********08
28Jun2021 add more sources for vaccine dictionary
already have : "$d_Qndfs""dictionary - CDC vaccine glossaries.txt"
https://www.cdc.gov/vaccines/terms/glossary.html
CDC - Centers for Disease Control and Prevention
search "Vaccine dictionary online"
https://www.dictionary.com
~18,000 just for intro-symbols and letter A!!
I need to write a script for this

08******08
27Jun2021 pTxt_make_pWrd - create a vaccine dictionary (list of words)
% open text editor to capture webPage text-only (not html code) ;
host link 'geany -i "' pTxt '"' ;
winTxtID := host_result link 'wmctrl -l | grep "' winTitle '" | tail -1 | cut -f1 -d" " ' ;
write link 'winTxtID = ' winTxtID ;

# 28Jun2021 example of copy-paste from window
link d_Qndfs 'economics, markets/options data [download, process].ndf'
% open yahoo financial options URL in browser, code from "$d_bin""starter/start_Markets.sh" ;
cmd := link 'bash "' d_start '""start_app.sh" 3s "browser" "' url '" ""' ;
write cmd ;
host cmd ;
host 'sleep 5s' ;
winID_yahooOptions := winTitle_get_winID winNm_yahooOptions ;
host 'sleep 3s' ;
write 'yahooOptions_downLoad, winID_yahooOptions = ' winID_yahooOptions ;
winIDorName_copyAllTextTo_pout winID_yahooOptions p_callPutts ;
host link 'xdotool key --window ' winID_yahooOptions ' ctrl+shift+w' ;
host 'sleep 3s' ;
host link 'sort -u "' pDicCom '" >"' p_temp2 '"' ;
host link 'mv "' p_temp2 '" "' pDicCom '"' ;
% remove junk lines ;
host link 'grep --invert-match "??\|??"
"' '" | sed "^ " "' pDicCom '" >"' p_temp2 '"' ; host link 'mv "' p_temp2 '" "' pDicCom '"' ; % remove white[^chr, line]s ; host link 'sed "^ " "' pDicCom '" >"' p_temp2 '"' ; host link 'mv "' p_temp2 '" "' pDicCom '"' ; 08******08 27Jun2021 pTxt_pDic_extract_pfrag_pFragSubs -> pTxt_make_pWrd No output?? I need a list of common [begin,end]ings of words [adjective, noun], [adverb, verb], [conjunctive, prepositions] Ok,after much chasing of my tail, it works up to write link timestamp_YYMMDD_HMS '-> read praw, build rawe[, Sup]FragL :' ; That produced 53 Mb, 5,359,179 lines of file output earlier in the afternoon, but seems to get stuck in a loop? +--+ qnial> pEmails_doALL +-------------+ 210627 14h18m34s-> pTxt_pDic_extract_pfrag_pFragSubs : 210627 14h18m34s-> generate a sorted list of wordFrags : 210627 14h23m09s-> merge pdic and pwrd, sort to pdwd : 210627 14h25m01s-> read pdwd, - write [, sub]FragL to prawe : rm: cannot remove '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory 210627 14h33m15s-> read praw, build rawe[, Sup]FragL : expanding heap to 38400000 words expanding heap to 46080000 words expanding heap to 55296000 words expanding heap to 66355200 words expanding heap to 79626240 words +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |rawe | 135521| 135521| +--------+-------+----------+ 210627 14h40m02s-> "invert" raweSupFragL to findSubFragL : +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |find | 97769| 97769| +--------+-------+----------+ 210627 17h42m41s-> "right lengthPairs" : ^C +--+ >> This may be choking!? I need to program smarter... - Get rid of underscores? (not enough to worry about...) - drop frags <= 2 character - can check longer side of splits - don't merge dictionary & frags? - try dropping <2 chr frags first +-----+ olde code # originally couldn't get to work - was EACHRIGHT in conditional, now OK % host 'bash "$d_bin""emails - convert pdfCompilation to text, list all words.sh" ' ; % host link 'grep -o -E ' chr_apo '\w+' chr_apo ' "' ptxt '" | grep --invert-match "[0-9]\+" | sort -u >"' pwrd '"' ; % host 'grep -o -E "\w+" "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt"' ; 08******08 27Jun2021 pTxt_pDic_extract_pfrag_pFragSubs -> pTxt_make_pWrd No output?? pdwd := link d_temp 'extract_pFragsAndSubs dicWrds.txt' ; praw := link d_temp 'extract_pFragsAndSubs raweFrags.txt' ; pwrd := link d_temp 'extract_pFragsAndSubs words.txt' ; create : #] pTxt_make_pWrd_test IS - res ipsa loquitor pTxt_make_pWrd_test IS { LOCAL p_1stClean pWrd ; NONLOCAL d_temp ; p_1stClean := link d_temp '1stClean temp.txt' ; pwrd := link d_temp 'extract_pFragsAndSubs words.txt' ; pTxt_make_pWrd p_1stClean pWrd ; } Hmm, must be format of grep : optr change : +.....+ host link 'grep -o -E "\w+" "' ptxt '" | grep --invert-match "[0-9]\+" | sort -u -f >"' pwrd '"' ; +.....+ To : +.....+ host link 'grep -o -E ' chr_apo '\w+' chr_apo '" ptxt '" | grep --invert-match "[0-9]\+" | sort -u >"' pwrd '"' ; +.....+ >> Nope. Why has this stopped working? Try direct bash command : $ grep -o -E '\w+' "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt" >> OK, it worked But I need QNial wrapper to work!! 
change again +.....+ host link 'grep -o -E ' chr_apo '\w+' chr_apo ' "' ptxt '" | grep --invert-match "[0-9]\+" | sort -u >"' pwrd '"' ; +.....+ qnial> link 'grep -o -E ' chr_apo '\w+' chr_apo ' "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt"' grep -o -E '\w+' "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt" >> looks good!?...same as bash!! grep -o -E '\w+' "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt" Very interesting failure - why? Try host 'grep -o -E "\w+" "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt"' ; >> still does nothing really fast Try modified (quote, apo) direct bash command : $ grep -o -E "\w+" "/media/bill/ramdisk/1stClean temp.txt" | grep --invert-match "[0-9]\+" | sort -u >"/media/bill/ramdisk/extract_pFragsAndSubs words.txt" see : $ bash "$d_bin""emails - convert pdfCompilation to text, list all words.sh" #] pTxt_make_pWrd IS OP pTxt pWrd - generate a sorted list of words from a text file pTxt_make_pWrd IS OP pTxt pWrd { IF (NAND (EACHRIGHT path_exists ("p_old pTxt) ("p_new pWrd))) THEN EACH write '?pTxt_make_pWrd error, file unknown, one of : ' pTxt pWrd ; ELSE host 'bash "$d_bin""emails - convert pdfCompilation to text, list all words.sh" ' ; ENDIF ; } OK -now 'extract_pFragsAndSubs words.txt' is created qnial> pEmails_doALL +-------------+ 210627 12h24m42s-> pTxt_pDic_extract_pfrag_pFragSubs : 210627 12h24m42s-> generate a sorted list of wordFrags : 210627 12h29m00s-> merge pdic and pwrd, sort to pdwd : >> but this fails : pdwd := link d_temp 'extract_pFragsAndSubs dicWrds.txt' ; #] pDicL_merge_pDic_test IS - res ipsa loquitor pDicL_merge_pDic_test IS { LOCAL pdic pDicL ; NONLOCAL d_Fauci d_temp ; pDicL := (link d_temp 'extract_pFragsAndSubs dicWrds.txt') (link d_Fauci '7_Fauci dictionary.txt') ; pdic := link d_temp 'pDicL_merge_pDic_test result.txt' ; pDicL_merge_pDic pDicL pdic ; } It looks like I used EACHRIGHT instead of EACH? IF (NAND (EACH path_exists ("p_old pTxt) ("p_new pWrd))) 08******08 26Jun2021 pTxt_pDic_extract_pfrag_pFragSubs - get it working! +-----+ qnial> pEmails_doALL 210626 16h14m28s-> pdf_convertTo_txt : 210626 16h14m35s-> pEmails_1stclean_pout : sed: -e expression #1, char 62: unknown option to `s' 210626 16h14m35s-> pTxt_pDic_extract_pfrag_pFragSubs : +-------------+ pTxt_pDic_extract_pfrag_pFragSubs steps : 210626 16h14m35s-> generate a sorted list of wordFrags 210626 16h14m35s-> merge pdic and pwrd, sort to pdwd 210626 16h14m35s-> read pdwd, - write [, sub]fragLines to prawe rm: cannot remove '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory 210626 16h14m35s-> read prawe, build raweFragL raweSupFragL wc: '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |rawe | 1| 1| +--------+-------+----------+ 210626 16h14m35s-> "invert" raweSupFragL to findSubFragL +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |find | 1| 1| +--------+-------+----------+ 210626 16h14m35s-> "right lengthPairs" [, n_,]find[,, Sub]FragL to ...pair... 
+--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |pair | 0| 0| +--------+-------+----------+ 210626 16h14m35s-> strL_write_pout [, sub]FragL +-------------++ |strLen_max = || +-------------++ +-------------+ 210626 16h14m35s-> pTxt_removeAdd_spaces : bypass sed_formatHeads, remove from pEmails_1stclean_pout : host link 'sed "' sed_formatHeads '" "' p_messyStuffs '" >>"' p_orgAcronym1 '" ' ; qnial> sed_messyStuffs change : +.....+ s/<\(.*\)@/<@/;s/<@[ ]*\(.*\)>/<@\1>/;s/[!:-_=~]\{5,\}// +.....+ To : +.....+ s/<\(.*\)@/<@/;s/<@[ ]*\(.*\)>/<@\1>/;s/\(.*\)[!:-_=~]\{5,\}\(.*\)/\1\2/ +.....+ qnial> sed_messyStuffs@55 / p_orgAcronym1 change : +.....+ qnial> sed1_orgAcronym change : +.....+ s/[({]N[I1JlTf]H\/N[I1JlTf]A[I1JlTf]O[D0][)}]/(NIH\/NIAIO);s/[({]N[I1JlTf]H\/CC\/[D0]LM[)}]/(NIH\/CC\/DLM)/;s/[({]N[I1JlTf]H\/F[I1JlTf]C[)}]/(NIH\/FIC)/ +.....+ To : +.....+ s/[({]N[I1JlTf]H\/N[I1JlTf]A[I1JlTf]O[D0][)}]/(NIH\/NIAIO)/;s/[({]N[I1JlTf]H\/CC\/[D0]LM[)}]/(NIH\/CC\/DLM)/;s/[({]N[I1JlTf]H\/F[I1JlTf]C[)}]/(NIH\/FIC)/ +.....+ First `( in s/[({]N[I1JlTf]H\/CC\/[D0]LM[)}] is the problem? missing terminating `/ previous sedExp! 210626 17h03m47s-> sed_messyStuffs : sed: -e expression #1, char 56: Invalid range end take out : 'strange sequences of punctuation' ';s/[!:-_=~]\{5,\}//' now pEmails_1stclean_pout seesm to work OK : p_messyStuffs p_orgAcronym1 p_orgAcronym2 p_orgAcronym3 qnial> loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf' qnial> pEmails_doALL IS - res ipsa loquitor 210626 17h05m55s-> read prawe, build raweFragL raweSupFragL wc: '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory 210626 19h24m22s-> read pdwd, - write [, sub]FragL to prawe : rm: cannot remove '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory 210626 19h24m22s-> read prawe, build rawe[, Sup]FragL : wc: '/media/bill/ramdisk/extract_pFragsAndSubs raweFrags.txt': No such file or directory +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |rawe | 1| 1| +--------+-------+----------+ 210626 19h24m22s-> "invert" raweSupFragL to findSubFragL : +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |find | 1| 1| +--------+-------+----------+ 210626 19h24m22s-> "right lengthPairs" : +--------+-------+----------+ |fragName|n_frags|n_subFrags| +--------+-------+----------+ |pair | 0| 0| +--------+-------+----------+ 210626 19h24m22s-> strL_write_pout [, sub]FragL : +-------------++ |strLen_max = || +-------------++ +-------------+ 210626 19h24m22s-> pTxt_removeAdd_spaces : >> Hmm, why isn't praw created!!??? Before praw, OK p_1stClean (huge!) none created [pWrd, pdic, pdwd] Check : pTxt_make_pWrd pTxt pWrd ; pDicL_merge_pDic (pwrd pdic) pdwd ; NYET!this is the proper symbol : pEmails_doALL change : +.....+ pTxt_pDic_extract_pfrag_pFragSubs p_1stClean p_dictionary p_fragL p_subFragL ; +.....+ To : +.....+ pTxt_pDic_extract_pfrag_pFragSubs p_1stClean p_dic p_fragL p_subFragL ; +.....+ >> This should have signalled a loaddef error? net - was correct symbol Blank out part of pEmails_doALL - these seem OK % write link timestamp_YYMMDD_HMS % '-> pdf_convertTo_txt : ' ; % pdf_convertTo_txt p_emailsPdf p_pdftotext ; % write '+-------------+' ; % write link timestamp_YYMMDD_HMS % '-> pEmails_1stclean_pout : ' ; % pEmails_1stclean_pout p_pdftotext p_1stClean ; Hah! I had commented out the operational line and forgot! 
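A quick illustration of the "unknown option to `s'" failure mode above (a sketch, not taken from the runs) : when a substitution loses its closing "/", sed absorbs the following ";s" into the replacement text and then tries to read the leftover characters as flags :
$ echo 'xx' | sed 's/x/y;s/x/z/'       # first s/ never closes - replacement swallows ";s" and the leftover "x/z/" is read as flags : unknown option to `s'
$ echo 'xx' | sed 's/x/y/;s/x/z/'      # with the terminating "/" restored, both substitutions run and the output is : yz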
write link 'cat ' strQuotedListOfDicL ' | sort -u >>"' pdic '" ' ;
% host link 'cat ' strQuotedListOfDicL ' | sort -u >>"' pdic '" ' ;
+-----+
qnial> loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf'
qnial> pEmails_doALL

08******08
25Jun2021 rebuild from scratch pTxt_pDic_extract_pfrag_pFragSubs
will likely have to re-grade other optrs!

25Jun2021 Major [file, backup]s loss!!
mostly seems to be recent Fauci-related files
grep froze with memory limitations last night
backups hadn't been working since 17:00 22Jun2021 - not warning
why? how? I've never seen this before
<< I forgot where I put the files!! everything is OK

08********08
24Jun2021 str_subStrs_getLenMatches_subStrPairs IS OP str subStrs - return (str = link subStrPairs)
>> it's not catching ANY legitimate strs!??
try comparing against dictionary

pTxt_pDic_extract_pfrag_pFragSubs change :
+.....+
% find lines of SORTED pwrd not in pdic ;
host link 'diff "' pdic '" "' pwrd '" | grep "^>" | grep --invert-match "[_]\+" | sed "s/^> //" | sort -u -f >"' pdif '"' ;
strL := strList_readFrom_path pdif ;
% ;
% grep isn't saving by ignoring diffs longer than subs, but has great [speed, flexibility] ;
write link timestamp_YYMMDD_HMS '-> prawe read pdif - write to prawe' ;
prawe := link d_temp 'extract_pFragsAndSubs raweFrags.txt' ;
% zero out prawe so that it doesn't accumulate ;
host link 'rm "' prawe '" ' ;
FOR str WITH strL DO
host link 'echo "<<:| " "' str '" >>"' prawe '"' ;
host link 'grep "' str '" "' pdif '" >>"' prawe '"' ;
ENDFOR ;
+.....+
To :
+.....+
% shortcut of pdif (frags not in pdic) ;
% host link 'diff "' pdic '" "' pwrd '" | grep "^>" | grep --invert-match "[_]\+" | sed "s/^> //" | sort -u -f >"' pdif '"' ;
strL := strList_readFrom_path ptxt ;
% ;
% grep isn't saving by ignoring diffs longer than subs, but has great [speed, flexibility] ;
write link timestamp_YYMMDD_HMS '-> prawe read pdif - write to prawe' ;
prawe := link d_temp 'extract_pFragsAndSubs raweFrags.txt' ;
% zero out prawe so that it doesn't accumulate ;
host link 'rm "' prawe '" ' ;
FOR str WITH strL DO
host link 'echo "<<:| " "' str '" >>"' prawe '"' ;
host link 'grep "' str '" "' ptxt '" >>"' prawe '"' ;
ENDFOR ;
+.....+

+-----+
olde code
% to reduce testing, start by ONLY considering pairs whose lengths < equal that of the frag ;
% 24Jun2021 str_subStrs_getLenMatches_subStrPairs isn't catching ANY legitimate strs!?? ;
% try comparing against dictionary ;
write link timestamp_YYMMDD_HMS '-> "rightLen pairs" [, n_,]find[,, Sub]FragL to [, n_,]pair[,, Sub]FragL' ;
pairFragL := findFragL ;
pairSubFragL := n_findFragL reshape [null] ;
i := 0 ;
WHILE (n_findFragL > i) DO
pairSubFragL@i := str_subStrs_getLenMatches_subStrPairs pairFragL@i findSubFragL@i ;
i := i + 1 ;
ENDWHILE ;
n_pairFragL pairFragL pairSubFragL := fragL_subFragL_getNonNull 'pair' pairFragL pairSubFragL ;

08********08
21Jun2021 dictionaries [strL_write_fout, listOfStrL_to_strExecuteMirror, listOfStrL_write_fout]
My search SUCKS!!
way to slow & inefficient pTxt_pDic_extract_pfrag_pFragSubs change : +.....+ write link timestamp_YYMMDD_HMS '-> chop null components of [rawFragL, supFragL, allSupFragL]' ; rawFragL supFragL allSupFragL := n_rawFragL EACHRIGHT take rawFragL supFragL allSupFragL ; write 'EACH (gage shape) rawFragL supFragL allSupFragL = ' (EACH (gage shape) rawFragL supFragL allSupFragL) ; % ; write link timestamp_YYMMDD_HMS '-> "invert" allSupFragL to allSubFragL' ; allSubFragL := n_rawFragL reshape [null] ; i := 0 ; WHILE (i < n_rawFragL) DO j := 0 ; WHILE (j < n_rawFragL) DO IF (OR (fragL@i EACHRIGHT = allSupFragL@(j))) THEN allSubFragL@i := allSubFragL@i append fragL@i ; ENDIF ; j := j + 1 ; ENDWHILE ; i := i + 1 ; % write link timestamp_YYMMDD_HMS ', i = ' (string i) ; ENDWHILE ; +.....+ To : +.....+ write link timestamp_YYMMDD_HMS '-> chop null components of [rawFragL, supFragL, allSupFragL]' ; rawFragL supFragL allSupFragL := n_rawFragL EACHRIGHT take rawFragL supFragL allSupFragL ; write 'EACH (gage shape) rawFragL supFragL allSupFragL = ' (EACH (gage shape) rawFragL supFragL allSupFragL) ; % ; write link timestamp_YYMMDD_HMS '-> "invert" allSupFragL to allSubFragL' ; allSubFragL := n_rawFragL reshape [null] ; i := 0 ; WHILE (i < n_rawFragL) DO FOR frag WITH allSupFragL@(i) DO IF (OR (isfault (j := find_Howell frag fragL) ))) THEN allSubFragL@j := allSubFragL@j append frag ; ELSE write link '?pTxt_pDic_extract_pfrag_pFragSubs error, frag unknown : ' frag ; ENDIF ; ENDFOR ; i := i + 1 ; % write link timestamp_YYMMDD_HMS ', i = ' (string i) ; ENDWHILE ; +.....+ BIG OOPS!! pTxt_pDic_extract_pfrag_pFragSubs change : +.....+ fragL subFragL := not_empty EACHRIGHT sublist fragL allSubFragL ; +.....+ To : +.....+ fragL subFragL := not_empty EACHRIGHT sublist rawfragL allSupFragL ; +.....+ +-----+ olde code IF flag_debug THEN write 'loading listOfStrL_to_strExecuteMirror' ; ENDIF ; #] strL_to_strExecuteMirror IS OP strL - convert strL to a self-return executable # www.BillHowell.ca 24Jun2021 initial # must be something in QNial?? # could be generalised to a mix of [str, optr]s ... sometime in distant future? 
program [segement,evolve]s # example : d_Qndfs 'dictionaries.ndf' -> pTxt_pDic_extract_pfrag_pFragsAndSubs strL_to_strExecuteMirror IS OP strL { (link (link ('(' chr_apo EACHRIGHT link (listOfStrL EACHLEFT link chr_apo ')')))) } IF flag_debug THEN write 'loading strL_write_pout' ; ENDIF ; #] strL_write_pout IS OP strL pout - write a list of strings to pout # www.BillHowell.ca 18Sep2018 initial strL_write_pout IS OP strL pout { LOCAL fout ; fout := open pout "w ; fout EACHRIGHT writefile strL ; close fout ; } list_writeTo_path IS strL_write_pout strList_writeTo_path IS strL_write_pout c01 := chr_apo 'bear' chr_apo ' ' chr_apo 'bull' chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo 'wolf' chr_apo ; c02 := chr_apo 'bear' chr_apo ' ' chr_apo 'bull' chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo 'wolf' chr_apo ; c03 := chr_apo 'bear' chr_apo ' ' chr_apo 'bull' chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo 'wolf' chr_apo ; c04 := chr_apo 'bear' chr_apo ' ' chr_apo 'bull' chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo 'wolf' chr_apo ; c05 := chr_apo 'bear' chr_apo ' ' chr_apo 'bull' chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo 'wolf' chr_apo ; c06 := chr_apo 'bear' chr_apo ' ' chr_apo null chr_apo ' ' chr_apo 'pig' chr_apo ' ' chr_apo null chr_apo ; % grep isn't saving by ignoring diffs longer than subs, but has great [speed, flexibility] ; write link timestamp_YYMMDD_HMS '-> praw write' ; host link 'rm "' praw '" ' ; FOR str WITH strL DO host link 'echo "<<:| " "' str '" >>"' praw '"' ; host link 'grep "' str '" "' pdif '" >>"' praw '"' ; ENDFOR ; % ; i := 0 ; WHILE (~= null rawFragL@i) DO i := 1 + 1 ; ENDWHILE ; n_rawFragL := i - 1 ; % create QNial list of frags and and "super-frags", not in the dictionary ; % use of the diffL first may allow me to derive "plausible" new words specific to the domain? ; % should use grep again!! -w --line-number ; write link timestamp_YYMMDD_HMS '-> rawFragL supFrags allSupFragL' ; rawFragL supFragL allSupFragL := dicMaxWords EACHRIGHT reshape [null] [null] [null] ; supFrags := null ; i := 0 ; finn := open praw "r ; WHILE (NOT isfault (line := readfile finn)) DO IF ('<<:| ' = (6 take line)) THEN rawFragL@i := 6 drop line ; IF (0 ~= i) THEN allSupFragL@(i - 1) := supFrags ; ENDIF ; supFrags := null ; ELSE supFrags := supFrags append line ; ENDIF ; i := i + 1 ; ENDWHILE ; close finn ; n_rawFragL := i ; write '(gage shape) rawFragL = ' n_rawFragL ; % capture last sub ; allSupFragL@(i - 1) := supFrags ; 08********08 21Jun2021 dictionaries, pTxt_pDic_extract_pFragsAndSubs - load fragsAndSubFrags OK - worked all day on [upgrades, separate dictionary in ndf, pTxt_pDic_extract_pfrag_pFragSubs, etc] qnial> loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf' >> OK qnial> pEmails_doALL nDat_indxsSumToNdat_get_ij IS OP nDat sumr { LOCAL iL indxs ; IF flag_break THEN BREAK ; ENDIF ; iL := tell nDat ; indxs := link (iL EACHLEFT EACHRIGHT pair (reverse iL)) ; (sumr EACHRIGHT = (EACH sum indxs)) sublist indxs } #] nDat_indxsSumToNdat_get_ij IS OP numL num - returns indices of numL, sum(numL@(i j)) = num # 20Jun2021 initial # separate optr to facilitate testing of pTxt_removeAdd_spaces, afor potential later general use # indxs include "reverses" - handy for testing frag catenations later nDat_indxsSumToNdat_get_ij IS OP nDat sumr { LOCAL iL indxs ; IF flag_break THEN BREAK ; ENDIF ; iL := tell nDat ; indxs := link (iL EACHLEFT EACHRIGHT pair (reverse iL)) ; (sumr EACHRIGHT = (EACH sum indxs)) sublist indxs } # nyet this - may have repeat frags! 
not_doubles := EACH ~= (EACH [first, last] indexs)) ; IF flag_debug THEN write 'loading nDat_indxsSumToNdat_get_ij_test' ; ENDIF ; #] nDat_indxsSumToNdat_get_ij_test IS - res ipsa loquitor # 20Jun2021 initial nDat_indxsSumToNdat_get_ij_test IS { LOCAL n_cols n_rows paths_tbl test_tbl c01 c02 c03 c04 c05 c06 c07 d01 d02 d03 d04 d05 d06 d07 s01 s02 s03 s04 s05 s06 s07 t01 t02 t03 t04 t05 t06 t07 ; % ; optr := 'nDat_indxsSumToNdat_get_ij' ; % ; c01 := 10 ; d01 := 3 ; c02 := 10 ; d02 := 5 ; c03 := 10 ; d03 := 10 ; c04 := 20 ; d04 := 15 ; c05 := 20 ; d05 := 5 ; c06 := 20 ; d06 := 15 ; % ; % careful with reverse lists - the results SHOULD give indices, NOT the numeric values!!! ; s01 := (0 3) (1 2) (2 1) (3 0) ; s02 := (0 5) (1 4) (2 3) (3 2) (4 1) (5 0) ; s03 := (1 9) (2 8) (3 7) (4 6) (5 5) (6 4) (7 3) (8 2) (9 1) ; s04 := (0 15) (1 14) (2 13) (3 12) (4 11) (5 10) (6 9) (7 8) (8 7) (9 6) (10 5) (11 4) (12 3) (13 2) (14 1) (15 0) ; s05 := (0 5) (1 4) (2 3) (3 2) (4 1) (5 0) ; s06 := (0 15) (1 14) (2 13) (3 12) (4 11) (5 10) (6 9) (7 8) (8 7) (9 6) (10 5) (11 4) (12 3) (13 2) (14 1) (15 0) ; % ; t01 := link optr ' c01 d01' ; r01 := execute t01 ; t02 := link optr ' c02 d02' ; r02 := execute t02 ; t03 := link optr ' c03 d03' ; r03 := execute t03 ; t04 := link optr ' c04 d04' ; r04 := execute t04 ; t05 := link optr ' c05 d05' ; r05 := execute t05 ; t06 := link optr ' c06 d06' ; r06 := execute t06 ; % ; test_tbl := 'OK' 'test' 'nDat' 'sum' 'std' 'result' (= s01 r01) 't01' c01 d01 s01 r01 (= s02 r02) 't02' c02 d02 s02 r02 (= s03 r03) 't03' c03 d03 s03 r03 (= s04 r04) 't04' c04 d04 s04 r04 (= s05 r05) 't05' c05 d05 s05 r05 (= s06 r06) 't06' c06 d06 s06 r06 ; n_cols := 6 ; n_rows := floor ( (gage shape test_tbl) / n_cols) ; write (n_rows n_cols reshape test_tbl) ; } +-----+ qnial> loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf' qnial> pEmails_doALL { NONLOCAL d_emails reference_Fauci title_contacts title_Subjects p_dictionary p_fixBodys p_fixHeads p_labelDates p_pdftotext p_spaces p_wordFrags p_contacts p_emailsPdf p_subjects sed_getContacts sed_dEmails sed_getSubject p_messyStuff p_formatHeads p_orgAcronym1 p_orgAcronym2 p_orgAcronym3 sed_messyStuff sed_formatHeads sed1_orgAcronym sed2_orgAcronym sed3_orgAcronym IF flag_break THEN BREAK ; ENDIF ; optr change : +.....+ +.....+ To : +.....+ +.....+ +-----+ olde code # loaddefs link d_Qndfs 'emails - convert pdfCompilation to text.ndf' IF flag_debug THEN write 'loading pEmailsRaw_removeAdd_spaces' ; ENDIF ; #] pTxt_pFrags_removeAdd_spaces IS OP pinn pFrags pout - #] pdic= dictionary of words (Linux); pwrd= words in pinn #] pdif= pwrds not in pdic; pFrags= each word and its fragments in pdiff # 19Jun2021 initial # 19Jun2021 the dictionary should be augmented by ripping quality health pages # of [disease, virus, drug]-related content (distant future project) # 20Jun2021 My coding is horribly inefficient, but endurably slow % pTxt_removeAdd_spaces pFrg p_spaces ; pTxt_pDic_extract_pFragsAndSubs p_fixHeads pFragsAndSubs pdic { LOCAL finn fout i line p_temp n_allSubFragL n_allSupFragL newStrL str strLenL strL supFragL ; NONLOCAL allSubFragL allSupFragL cullFragL cullSubFragL fragL n_fragL ; % ; IF (NOT AND (EACH path_exists ("p_old pinn) ("p_new pFrg) ) ) THEN EACH write '?pEmailsRaw_removeAdd_spaces error, file unknown, one of : ' pinn pout '' ; ELSE % ; % find all subs of frags (potential builders) ; allSubFragL := null ; FOR frag WITH fragL DO allSubFragL := allSubFragL append ((frag EACHRIGHT in allSupFragL) sublist fragL) ; ENDFOR ; 
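% note : the FOR loop above tests each frag against every entry of allSupFragL, ;
% roughly (n_fragL * n_allSupFragL) = 21687 * 21687 =~ 470 million "in" checks, ;
% which is the cost the later [grep-based, find_Howell sorted-search] versions are meant to avoid ;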
n_allSubFragL := gage shape allSubFragL ; IF (~= n_fragL n_allSubFragL) THEN write '? error : (~= n_fragL n_allSubFragL)' ; ENDIF ; EACH write % each fragL includes at least the frag itself ; (link '(gage shape) fragL = ' (string n_fragL )) (link '(gage shape) allSubFragL = ' (string n_allSubFragL)) ; % ; % cull [fragL, allSubFragL] ; is_empty := (EACH (gage shape) allSubFragL) < 2 ; cullFragL cullSubFragL := is_empty EACHRIGHT sublist fragL allSubFragL ; write link '(gage shape) cullFragL = ' (string (gage shape cullFragL)) ; % ; % find constructors in cullFragL, 20Jun2021 consider just pairs of frags (later >= 2 eg triplets) ; % to reduce testing, start by ONLY considering pairs whose lengths equal that of the frag ; matchFragL := n_fragL reshape [null] ; i := 0 ; WHILE (n_fragL > i) DO fragPairs := (n_fragL EACHLEFT nDat_indxsSumToNdat_get_ij (gage shape cullFragL@i)) EACHLEFT pick cullFragL@i ; matchFragL@i := ((EACH link fragPairs) = cullFragL@i) sublist fragPairs ; nowWhat i := i + 1 ; ENDFOR ; ENDIF ; } # code in wait # 20Jun2021 # EACH (gage shape) fragL allSupFragL -->[stepv] resume (gage shape) fragL = 21687 (gage shape) allSupFragL = 21687 qnial> shaperSort := sortup EACH (gage shape) allSupFragL ; qnial> 10 take shaperSort 0 1 1 1 1 1 1 1 1 1 qnial> 10 takeright shaperSort 1027 1040 1042 1057 1073 1084 1118 1229 1553 1965 08********08 21Jun2021 'frags and subFrags.txt' 1. find frags that add up to strs in dictionary 2. byproduct is those that don't - cull those that can be constructed from dictionary 3. remanents - how many? and can I auto-extract "new words" easily? +-----+ 1. find frags that add up to strs in dictionary QNial to find frags that sum words in dictionary or frags % create QNial list of fragCombines that can be used to search for components and missing spaces ; -->[stepv] resume (gage shape) fragL = 21687 (gage shape) allSupFragL = 21687 qnial> shaperSort := sortup EACH (gage shape) allSupFragL ; qnial> 10 take shaperSort 0 1 1 1 1 1 1 1 1 1 qnial> 10 takeright shaperSort 1027 1040 1042 1057 1073 1084 1118 1229 1553 1965 % messyStuff p_pdftotext p_messyStuff sed_messyStuff ; % write '-> p_formatHeads : ' ; % formatHeads p_messyStuff p_formatHeads sed_formatHeads ; % write '-> p_orgAcronym1 : ' ; % orgAcronym1 p_formatHeads p_orgAcronym1 sed1_orgAcronym ; % write '-> p_orgAcronym2 : ' ; % orgAcronym2 p_orgAcronym1 p_orgAcronym2 sed2_orgAcronym ; % write '-> p_orgAcronym3 : ' ; % orgAcronym3 p_orgAcronym2 p_orgAcronym3 sed3_orgAcronym ; % write '-> pEmails_labelDates_pOut : ' ; % pEmails_labelDates_pOut p_orgAcronym3 p_labelDates ; +-----+ # code in wait % generate a sorted list of wordFrags of the emails ; % host link 'grep -o -E "\w+" "' pinn '" | grep --invert-match "[0-9]\+" | sort -u -f >"' pwrd '"' ; find lines of pwrd not in pdic, requires SORTED files in this case ; host link 'diff "' pdic '" "' pwrd '" | grep "^>" | grep --invert-match "[_]\+" | sed "s/^> //" | sort -u -f >"' pdif '"' ; writefile words Frags ; strL := strList_readFrom_path pdif ; % ; % grep isn't saving by ignoring diffs longer than subs, but has great [speed, flexibility] ; host 'echo "p_frag" >"' pFrg '"' ; FOR str WITH strL DO host link 'echo "<<:| " "' str '" >>"' pFrg '"' ; host link 'grep "' str '" "' pdif '" >>"' pFrg '"' ; ENDFOR ; % ; % create QNial list of fragCombines that can be used to search for components and missing spaces ; % "normal" words are in the dictionary, not in the diffL, so I will have to use the dictionary ; % HOWEVER - use of the diffL first may allow me to 
derive "plausible" new words specific to the domain? ; % double-check that pFrg was created ; % 20Jun2021 would it be much more efficient to preset (gage shapes) and use indexing rather than appends ? ; IF (NOT (path_exists ("p_old pFrg))) THEN EACH write '?pEmails_fixBodys_pout error, file unknown, one of : ' pFrg '' ; ELSE fragL supFragL allSupFragL := null null null ; finn := open pFrg "r ; WHILE (NOT isfault (line := readfile finn)) DO IF ('<<:| ' = (6 take line)) THEN frag := 6 drop line ; fragL := fragL append frag ; allSupFragL := allSupFragL append supFragL ; supFragL := null ; ELSE supFragL := supFragL append line ; ENDIF ; ENDWHILE ; % capture last sub ? ; % careful - watch for leading nulls ; close finn ; n_fragL n_allSupFragL := EACH (gage shape) fragL allSupFragL ; IF (~= n_fragL n_allSupFragL) THEN write '? error : (~= n_fragL n_allSupFragL)' ; ENDIF ; EACH write (link '(gage shape) fragL = ' (string n_fragL )) (link '(gage shape) allSupFragL = ' (string n_allSupFragL)) ; ENDIF ; % ; % find all subs of frags (potential builders) ; allSubFragL := null ; FOR frag WITH fragL DO allSubFragL := allSubFragL append ((frag EACHRIGHT in allSupFragL) sublist fragL) ; ENDFOR ; n_allSubFragL := gage shape allSubFragL ; IF (~= n_fragL n_allSubFragL) THEN write '? error : (~= n_fragL n_allSubFragL)' ; ENDIF ; EACH write (link '(gage shape) fragL = ' (string n_fragL )) (link '(gage shape) allSubFragL = ' (string n_allSubFragL)) ; % ; % cull [fragL, allSubFragL] ; is_empty := (EACH (gage shape) allSubFragL) < 2 ; cullFragL cullSubFragL := is_empty EACHRIGHT sublist fragL allSubFragL ; write link '(gage shape) cullFragL = ' (string (gage shape cullFragL)) ; # olde as available in a pdf document that was posted by Jason Leopold : ' 'https://www.documentcloud.org/documents/20793561-leopold-nih-foia-anthony-fauci-emails' 'See also the article by Natalie Bettendorf, Jason Leopold 01Jun2021 "Anthony Faucis Emails Reveal The Pressure That Fell On One Man", ' 'https://www.buzzfeednews.com/article/nataliebettendorf/fauci-emails-covid-response' 08********08 18Jun2021 use [diff, /usr/share/dict/american-english, 210609 Leopold - Anthony Fauci emails, NIH Freedom Of Information Act, 1st clean.txt] could also collect documents in that area of [health, medicine, science, policy] cull a specialised dictionary and names diff -> words [, not] in dictionaries grep match improper word with [, non-]viable [combined, split] [, sub] sequences space-[deleted, inserted] cart [before, after] improper word-sequences sed run through text and insert "{|:given, several alternate subs]:|}" later - context from documents in that area of [health, medicine, science, policy] where is the brocolli? diff_dictionary work via 'emails - convert pdfCompilation to text.ndf' I created : pEmls_removeAdd_spaces IS OP pinn pout pdic pwrd pEmails_addIntro IS OP pinn pout pintro - add introdution to pEmailsRaw # far too large [n_calc, strLen, n_file]s!!! % list wordFrags subWordFrags ; strL := strList_readFrom_path pdif ; % strL should be : ranked according to strLength!! Otherwise much of the calculation is a waste ; strLenL := null ; FOR str WITH strL DO strLenL := link strLenL (gage shape str) ; ENDFOR ; strLenL strL := EACH reverse (lists_sortupOn1st strLenL strL) ; newStrL := strL ; FOR str WITH strL DO subL := str ; FOR sub WITH newStrL DO IF (sub subStr_in_str str) THEN subL := link subL chr_tab sub ; ENDIF ; ENDFOR ; newStrL := rest newStrL ; ENDFOR ; write 'pTxt_removeAdd_spaces - writing results...' 
; fout := open pSub "w ; subL strList_writeTo_path pSub ; close fout ; HUGE calcs and lists!!! qnial> mix EACH [pass, square, sum tell] (100 1000 10000 100000) 100 10,000 4,950 1,000 1,000,000 499,500 10,000 100,000,000 49,995,000 100,000 10,000,000,000 4,999,950,000 >> I'm surprised - sum (tell x) is a constant portion of square x >> if one considers 3 frags (2 spaces in word) or 3 joins, it goes up to nutso numbers! >> strLs could be far too large, crashing and losing everything after 1/2 billion calcs >> outputting of results to file for each frag - 20,000 files!! qnial> mix EACH [pass, square, sum tell] (5000 10000 15000 20000) 5,000 25,000,000 12,497,500 10,000 100,000,000 49,995,000 15,000 225,000,000 112,492,500 20,000 400,000,000 199,990,000 Maybe - crawl through file, address each non-dictionary situation individually, as the frags appear in [2,3 +] groups fix ones that are easy write to file ones that are not sorted search of all words in 50Mb 'Fauci corona virus emails, clean.txt' won't be fast either! line-by-line, NYET - try it first without : test the text in lowercase, NYET - don't appear to bee requent? : FIFO stack [old, new] lines - look for word breaks across lines % crawl through file, address each non-dictionary situation individually ; % nyet - not for the first attempt! dicL := EACH to_lowercase strList_readFrom_path pdic ; % nyet - not for the first attempt! lineL := EACH to_lowercase strList_readFrom_path pinn ; % hopefully sorted searches will help a lot? ; dicL := strList_readFrom_path pdic ; FOR line WITH lineL DO % deal with alpha-only words ; alphaL := host_result 'echo "' line '" | sed "^[A-Za-z]*/ /g" ' ; strL := alphaL str_cutBy_chr ` ; booL := strL EACHLEFT find_Howell pdif ; IF (OR booL) THEN ??? := subList (tell (gage shape strL)) ; newBooL := booL_notsToOne booL ; strGoodL := (NOT booL) sublist badL := booleanL_cut_list booL strL ; repairL := EACH link badL ; FOR i WITH (tell (gage shape) repairL) DO IF (isfault (repairL@i find_Howell dicL)) THEN % reduce sequential o's to one only ; strL := (NOT booL_seqosToOne booL) badL ; ENDIF ; ENDFOR ; writefile pfrag (link front (badL EACHLEFT link `)) ; ELSE writefile fout line ; ENDIF ; ENDFOR ; >> This is NUTS as well! seems best to construct a semi-permanent file of [merge, split] frags, even if that takes forever to calculate? qnial> loaddefs link d_Qndfs 'emails - generic optrs.ndf' qnial> pEmails_doALL >> started 19:53, stopped 20:15 - I will try grep qnial> loaddefs link d_Qndfs 'emails - generic optrs.ndf' qnial> pEmails_doALL >> holy crap the grep version is fast, but did it work? +-----+ olde code f_1stClean := 'Fauci emails, 1st clean.txt' ; f_emailsClean := 'Anthony Fauci corona virus emails, clean.txt' ; # % [pdif, strL] should be : ranked according to strLength!! 
; % Otherwise much of the calculation is a waste, horribly inefficient ;
strLenL := null ;
FOR str WITH strL DO strLenL := link strLenL (gage shape str) ; ENDFOR ;
strLenL strL := EACH reverse (lists_sortupOn1st strLenL strL) ;
newStrL := strL ;
fFrg := open pFrg "w ;
IF (sub subStr_in_str str) THEN writefile fFrg (link str chr_tab sub) ; ENDIF ;
close fFrg ;

08********08
18Jun2021 fixing the spaces within words from pdftotext
take a break from [sed1_orgAcronym sed2_orgAcronym sed3_orgAcronym]
simple idea - sorted list of all [alphabetic, alphanumeric] cart [1,2] word sequences, with counts and within the document ONLY
alternate - use Linux dictionary
much more general
also do both
+-----+
command line spelling checker that I use :
"$d_SysMaint""text processors/aspell spelling checker notes.txt"
can one access the list of words directly?
$ aspell check "$d_PROJECTS""Neural Nets/People/James, Colin/210303 Howell - Colin James Missouri S&TU recommentation.txt"
this is interactive in a document
+-----+
https://en.wikipedia.org/wiki/Words_(Unix)
words is a standard file on Unix and Unix-like operating systems, and is simply a newline-delimited list of dictionary words. It is used, for instance, by spell-checking programs.[1] The words file is usually stored in /usr/share/dict/words or /usr/dict/words. On Debian and Ubuntu, the words file is provided by the wordlist package, or its provider packages wbritish, wamerican, etc. On Fedora and Arch Linux, the words file is provided by the words package.
>> on my system, /usr/share/dict/words links to : /etc/dictionaries-common/words
which in turn links to : /usr/share/dict/american-english
>> 99,171 words, it has prenames etc
>> looks good for general stuff
+-----+
start by making a list of alpha-numerics in the Fauci emails
which Unix tool? search "linux list words in a text file"
+--+
https://stackoverflow.com/questions/16489317/how-to-generate-list-of-unique-words-from-text-file-in-ubuntu
$ grep -o -E '\w+' temp | sort -u -f
$ | tr -cs 'a-zA-Z0-9' '\n'
The -c is for the complement of the specified characters; the -s squeezes out duplicates of the replacements; the 'a-zA-Z0-9' is the set of alphanumerics, if you add a character here, the input won't get delimited on that character (see another example below); the '\n' is the replacement character (newline).
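An untested variant of the same recipe that also gives per-word counts (relevant to the word-count question further below); $pin and $pout are placeholder paths, not existing files :
$ grep -o -E '\w+' "$pin" | grep --invert-match "[0-9]\+" | sort -f | uniq -ci | sort -rn >"$pout"
# uniq -ci needs the case-folding sort -f ahead of it; output is one "count word" pair per line, biggest counts first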
Try tr :
$ cat "$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/210609 Leopold - Anthony Fauci emails, NIH Freedom Of Information Act, 1st clean.txt" | tr -cs 'a-zA-Z0-9' '\n' >"$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/6_word list.txt"
>> 44.9 Mb file, but not sorted

Use grep
$ grep -o -E '\w+' "$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/210609 Leopold - Anthony Fauci emails, NIH Freedom Of Information Act, 1st clean.txt" | sort -u -f >"$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/6_word list.txt"
>> 295.4 kb file, sorted

get rid of numeric-only
$ grep --invert-match "[0-9]\+" "$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/6_word list.txt" | grep --invert-match "[_]\+" >"$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/6_word non-numeric list.txt"
>> 249.6 kb file, 31455 "words", sorted

$ grep --invert-match "[0-9]\+" "$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/6_word list.txt" | sed "s/[_]\+//g" | sort -u >"$d_webRawe""Pandemics, health, and the Sun/corona virus/Fauci covid emails/7_word non-[numeric, empty] list.txt"
>> no good, just manually delete the 30 empty lines, I'm too lazy
>> 251.0 kb, 31,607 words

It appears that I have to split run-together words too.

Now do a word count for each : bash script
most words don't have a count? is there a null or something rather than a space?
qnial> EACH charrep 'M I'
77 32 73
>> nyet

example sequence (incomplete) :
+--+
0\tInstitu
0\tinstituciones
0\tInstitut
0\tInstitute
0\tinstituted
0\tInstitutefor
0\tInstitutes
0\tInstitutesoJHea
0\tinstituting
0\tInstitution
0\tinstitutional
0\tinstitutionali
0\tinstitutionally
0\tinstitutions
0\tinstr
0\tinstrncted
0\tinstruct
0\tinstructed
0\tinstruction
0\tInstructional
0\tinstructions
0\tinstructive
0\tInstructor
0\tinstrument
+--+

# enddoc