# /media/bill/SWAPPER/Qnial/MY_NDFS/emails from conf papers.ndf
# www.BillHowell.ca  initial 12Feb2018
# lq_confPapers (next definition) is a shortcut : evaluating it runs loaddefs on this file.
# d_Qnial_mine is not defined in this file.

lq_confPapers IS loaddefs link d_Qnial_mine 'emails from conf papers.ndf'

write '>>>>>>>>>>>>>>>' ;
write 'loading "emails from conf papers.ndf"' ;

#****************************
# Process
#  1. make sure the data file is ASCII, not Unicode
#  2. run the program
#  3. paste the output list into a spreadsheet
#  4. sort the list
#  5. cull out david.brown etc
#  6. cull out mailer daemon, postmaster, and hard-to-confirm system emails
#  7. cull out duplicates by using something like:  =IF(B398=B397;1;0)
#  8. check for "Unauthorized" emails
#  9. append the "clean" list after "End of list" in the "RemoveEmails" worksheet
#     in "IJCNN07 email promo list 061119.ods"
# 10. move "End of list" and extend the range of the "undeliverable" range in the "RemoveEmails" worksheet
# 11. sort the list by email
# 12. cull out duplicates by using something like:  =IF(B398=B397;1;0)
# NOTE: as legitimate emails can be rejected for communications glitches etc, there should be a process to
#    "add them back in"

# main code of the routines
# so far doesn't check for non-existent files etc

#*******************************************
# Debugging - see "/media/bill/SWAPPER/Qnial/0_bug hunt QNial.txt"
# In Nial, l is boolean true and o is boolean false.

flag_break := l ;
flag_debug := o ;

# Reminder of debug setup...
#    IF flag_break = l THEN Break ; ENDIF ;

# write_debug IS OP AAA - write AAA only when the global flag_debug is true

write_debug IS OP AAA
{
   NONLOCAL flag_debug ;
   IF flag_debug THEN
      write AAA ;
   ENDIF ;
}

#***************************************
# global variables and pre-defs

#+-----+
# These are loaded in the next sub-section by 'email - extract, sort, cull addresses from text.ndf'
# *regexp_substitute* R S T [O]   replace one or more substrings in string T that match
#    the regular expression pattern R with string S
# Content - Structure.ndf
#    sd_ IS OP s_raw         : structure definition
#                              07Aug2017 for now only for simple 2D array
#    sg_ IS OP struct s_lab  : get component of structure ;
#    sp_ IS OP sdat s_lab    - [put,change] component of structure
#                              "place" usage is probably wrong!
#    sm_ IS OP sdat s_lab    - apply internal method to structure
#                              08Aug2017 not yet set up
#
#IF not in "SG_ (EACH first symbols 0) THEN loaddefs (link d_QNial_mine 'Structure.ndf') 0 ; ENDIF;

#+-----+
# Load the email-extraction definitions only when EMAILS_EXTRACT_FROM_FILE is not
# already among the first items of (symbols 0).

IF not in "EMAILS_EXTRACT_FROM_FILE (EACH first symbols 0) THEN
   loaddefs (link d_QNial_mine 'email - extract, sort, cull addresses from text.ndf') 0 ;
ENDIF;

white_chars := ' "[]<>();,' ;
white_chars_email := ' "[]<>();,:?' ;

# from 'strings.ndf'
#char_apostrophe := char 39 ;

write 'done loading preliminaries' ;

#***************************************
# Operators

# 13Feb2018 - This is in 'email - extract, sort, cull addresses from text.ndf'
# unicode_to_ascii IS OP fin_name fas_name - Use this to reduce the number of problematic character codes.

# unicode_to_ascii IS OP fin_name fas_name - convert the UTF-8 file fin_name to the ASCII file fas_name
#    Shells out to iconv. The -c option makes iconv omit characters it cannot convert.
#    Both paths are wrapped in double quotes in the shell command, so they may contain spaces.

unicode_to_ascii IS OP fin_name fas_name
{
   host (link 'iconv -f utf-8 -t ascii -c "' fin_name '" > "' fas_name '" ') ;
}

# Example :
# unicode_to_ascii '/media/bill/SWAPPER/2017 NIPS Long Beach, California/0_NIPS 2017 URLs all raw.txt' '/media/bill/SWAPPER/2017 NIPS Long Beach, California/0_NIPS 2017 URLs all.txt'

# Commentary on problems with extract_paperURLs :
# see "/media/bill/SWAPPER/Qnial/0_QNial notes.txt"

# extract_paperURLs IS OP p_URLs_stub
#    Reads <d_conference><p_URLs_stub>.txt line by line and, for each line, writes every item
#    returned by (strings_between URLpaper ender line) to <d_conference><p_URLs_stub> papers.txt.
#    strings_between is defined elsewhere - presumably it returns the substrings found between
#    the two delimiters ; confirm against its definition.
#    The conference directory and URL prefixes are hard-coded for NIPS 2017.
#    The grep and unicode_to_ascii steps that build the input file are commented out, so the
#    input file must already exist (there is no check for a missing file).
#    Fixes in this version : removed a stray token "to" in front of the URLauthor assignment ;
#    the LOCAL list had the fused name fotPapersp_paperURLs and omitted fotPapers, p_URLs and line.

extract_paperURLs IS OP p_URLs_stub
{
   LOCAL d_conference ender fin fotPapers line
         p_confList p_URLs p_URLs_raw p_paperURLs p_authorURLs
         URLpaper URLauthor ;
   %fotAuthors ;
   d_conference := '/media/bill/SWAPPER/2017 NIPS Long Beach, California/' ;
   p_confList   := link d_conference '0_NIPS 2017 Proceedings page.html' ;
   p_URLs_raw   := link d_conference '0_NIPS 2017 URLs all raw.txt' ;
   p_URLs       := link d_conference p_URLs_stub '.txt' ;
   p_paperURLs  := link d_conference p_URLs_stub ' papers.txt' ;
   p_authorURLs := link d_conference '0_NIPS 2017 URLs author.txt' ;
   % ;
   URLpaper  := 'https://papers.nips.cc/paper/' ;
   URLauthor := 'https://papers.nips.cc/author/' ;
   ender     := '">' ;
   % ;
   %grepexpr := link 'grep "' URLpaper '" "' p_confList '" >"' p_URLs_raw '" ' ;
   %write grepexpr ;
   %host grepexpr ;
   % Ensure only ASCII characters are in file ;
   %unicode_to_ascii p_URLs_raw p_URLs ;
   % ;
   fin       := open p_URLs "r ;
   fotPapers := open p_paperURLs "w ;
   %fotAuthors := open p_authorURLs "w ;
   % readfile yields the fault ??eof at end of file, which ends the loop ;
   WHILE (~= ??eof (line := readfile fin)) DO
      %write 'in extract_paperURLs' ;
      %write status ;
      %write line ;
      % EACH write (strings_between URLpaper ender line) ;
      fotPapers EACHRIGHT writefile (strings_between URLpaper ender line) ;
      % quarter-second pause per input line, kept from the original ;
      host 'sleep 0.25' ;
   ENDWHILE ;
   EACH close fin fotPapers ;
}

# 13Feb2018 I was unable to download ([username, password] problem with ftp), so I used :
# see "/media/bill/SWAPPER/System_maintenance/internet & wifi/web downloads - nips-grab & BeautifulSoup.txt"

# save_paperURL IS OP p_URL - PLACEHOLDER, not a working download
#    Builds p_paper as <d_conference><p_URL> and runs rsync from the conference directory to that
#    path. The original author marked the rsync line as fake - it must be adapted for https download.
#    Fix in this version : the LOCAL list was a copy of the one in extract_paperURLs (including the
#    fused name fotPapersp_paperURLs) and did not declare p_paper ; it now lists only what is used.

save_paperURL IS OP p_URL
{
   LOCAL d_conference p_paper ;
   d_conference := '/media/bill/SWAPPER/2017 NIPS Long Beach, California/' ;
   p_paper      := link d_conference p_URL ;
   % ;
   % fake line - must adapt for https download ;
   host link 'rsync -aru "' d_conference '" "' p_paper '" ' ;
}

# save_papers IS { null }

# 14Feb2018 Cull emails from NIPS2017 papers
$ cd "/media/bill/SWAPPER/2017 NIPS Long Beach, California/"
$ find -iname "*.pdf" | sed 's/\(^\.\/\)\(.*\)/\2/' | sort >"0_NIPS2017 paper list.txt"

# Structure handed to emails_extract_from_file (sd_ is the structure-definition operator from Structure.ndf).
# temp_flag is not defined in this file.

sd_ "NIPS2017_emails_extract
{  "d_inn  '/media/bill/SWAPPER/2017 NIPS Long Beach, California/'
   "f_inn  '0_NIPS2017 pdf.txt'
   "d_out  '/media/bill/SWAPPER/2017 NIPS Long Beach, California/'
   "root   '0_NIPS2017 author emails'
   "test   temp_flag
} ;

# f_autEmMan below is a direct consequence of root above via emails_extract_from_file
# testing : "f_paperList '0_NIPS2017 paper list1.txt'
# prodn   : "f_paperList '0_NIPS2017 paper list.txt'

sd_ "NIPS2017_emails
{  "d_wrk           '/media/bill/SWAPPER/2017 NIPS Long Beach, California/'
   "f_paperList     '0_NIPS2017 paper list.txt'
   "f_pdfTxt        '0_NIPS2017 pdf.txt'
   "f_autEmMan      '0_NIPS2017 author emails 4 manual review & clean.txt'
   "f_autEmCum      '0_NIPS2017 author emails cumulative.txt'
   "f_authorEmails  '0_NIPS2017 author emails.txt'
} ;

# get_confPaper_emails IS OP confExtr_phr confEmails_phr
#    confExtr_phr   - phrase naming the structure passed to emails_extract_from_file
#                     (example : "NIPS2017_emails_extract)
#    confEmails_phr - phrase naming the structure that holds d_wrk and the f_* file names
#                     (example : "NIPS2017_emails)
#    For each line of the paper list (each line is taken as a pdf file name relative to d_wrk) :
#       1. pdftotext converts the pdf to p_pdfTxt - p_pdfTxt is over-written for each paper
#       2. emails_extract_from_file is run on the confExtr_phr structure
#       3. p_autEmMan is appended to p_autEmCum (presumably p_autEmMan is written by step 2,
#          see the note on f_autEmMan above ; confirm)
#    Finally p_autEmCum is passed through sort --unique into p_authorEmails.
#    p_autEmCum is only ever appended to here, so it keeps growing across runs unless cleared by hand.
#    The f_inn / root entries of the confExtr_phr structure must agree with f_pdfTxt / f_autEmMan.
#    Fixes in this version :
#       - the final host call had no link, so host was given a list of strings, not one command
#       - confExtr_phr was ignored and "NIPS2017_emails_extract was hard-coded in the loop
#       - fin_paperList and line are now declared LOCAL

get_confPaper_emails IS OP confExtr_phr confEmails_phr
{
   LOCAL d_wrk f_paperList f_pdfTxt f_autEmMan f_autEmCum f_authorEmails
         p_paperList p_pdfTxt p_autEmMan p_autEmCum p_authorEmails
         fin_paperList line ;
   % fetch the directory and file names from the structure, then build the full paths ;
   d_wrk f_paperList f_pdfTxt f_autEmMan f_autEmCum f_authorEmails :=
      confEmails_phr EACHRIGHT sg_ "d_wrk "f_paperList "f_pdfTxt "f_autEmMan "f_autEmCum "f_authorEmails ;
   p_paperList p_pdfTxt p_autEmMan p_autEmCum p_authorEmails :=
      d_wrk EACHRIGHT link f_paperList f_pdfTxt f_autEmMan f_autEmCum f_authorEmails ;
   % ;
   %BREAK ;
   fin_paperList := open p_paperList "r ;
   % ;
   WHILE (~= ??eof (line := readfile fin_paperList)) DO
      write line ;
      host link 'pdftotext "' (link d_wrk line) '" "' p_pdfTxt '" ' ;
      emails_extract_from_file confExtr_phr ;
      host link 'cat "' p_autEmMan '" ' ' >>"' p_autEmCum '" ' ;
   ENDWHILE ;
   close fin_paperList ;
   % ;
   host link 'cat "' p_autEmCum '" | sort --unique >"' p_authorEmails '" ' ;
}

# lq_confPapers
# get_confPaper_emails "NIPS2017_emails_extract "NIPS2017_emails
# sg_ "NIPS2017_emails "f_paperList
$ cd "/media/bill/SWAPPER/2017 NIPS Long Beach, California/"
$ cat "0_NIPS2017 author emails cumulative.txt" | sort --unique >"0_NIPS2017 author emails.txt"

# emails_extract_from_file

# 14Feb2018 19:10 full run of 679 papers
# Estimated time :
#    RaspPi : 8 sec/paper * 679 = 5432 seconds / 60 s/min = 90.5 minutes
#    LMDE2 - faster, but not double speed

# This is NOT working - why ?
$ pdftotext "6606-wider-and-deeper-cheaper-and-faster-tensorized-lstms-for-sequence-learning.pdf"
>> elmahdi.elmhamdi@epfl.ch

#
$ cd "/media/bill/SWAPPER/2017 NIPS Long Beach, California/"
$ no workee : ls -1 | grep ".bib" | xargs cat | grep "Toward Multimodal Image-to-Image Translation"

# Errors
6634-on-structured-prediction-theory-with-calibrated-convex-surrogate-losses.pdf
Syntax Warning: Bad annotation destination
...
7280-on-separability-of-loss-functions-and-revisiting-discriminative-vs-generative-models.pdf
Syntax Warning: Illegal annotation destination
...

# emails from text docs
qnial> loaddefs link d_Qnial_mine "email - extract, sort, cull addresses from text.ndf"
qnial> emails_extract_from_file ()

write 'done loading "emails from conf papers.ndf"' ;
write '<<<<<<<<<<<<<<<' ;

# enddoc