' | sed 's|

$.*$, $[a-zA-Z]\{3\}$$.*$ $[0-9]\+$

|2022-\2-\4|' ) if [ "$new_date" != "" ]; then echo "$new_date
" >>"$p_news" echo "
" >>"$p_news" echo "" >>"$p_news" old_date="$new_date" fi out_article=$( echo "$line" | grep '

[ ]\+

|
\n\n|;s|
||;s|
||;s|
||;s| title=\".\/">|\">|;s|
||;s|
||;s|
' | sed 's|$.$
$.$<\/p>|\2|' ) #echo "$content" if [ "$out_article" != "" ]; then inn_article="" echo "
" >>"$p_news" echo "" >>"$p_news" elif [ "$link" != "" ]; then echo "$old_date $old_time $link
" >>"$p_news" elif [ "$content" != "" ]; then echo "$content
" >>"$p_news" fi done fi done 9< "$p_source" else echo "kyivindependent.com page source.txt doesnt exist : $p_source" fi } #] ukrinform_news() - extract news items from page source # 06Mar2022 initial # 08Mar2022 not handy yet : # cannot "save Page As" or "Web developer -> Save as" before current day # for now copy-paste from webPage, add URLs (time-consuming!) ukrinform_news() { p_source="$d_temp"'ukrinform.net page source.html' p_news="$d_temp"'ukrinform.net news articles.html' echo "" >"$p_news" if [ -f "$p_source" ]; then old_date="2022-03-14" while read -u 9 line; do new_date=$( echo "$line" | grep '$.$<\/a><\/h2>| \2<\/a> |' ) echo "$new_date $link
" >>"$p_news" paragraph="" flagStop="" while [ "$flagStop" == "" ]; do read -u 9 line flagStop=$( echo "$line" | grep '' ) if [ "$flagStop" == "" ]; then paragraph="$paragraph $line" fi done paragraph=$( echo "$paragraph" | sed 's|
| |g;s|
| |g' ) #echo "$paragraph" echo "$paragraph
" >>"$p_news" echo "
" >>"$p_news" echo "" >>"$p_news" fi done 9< "$p_source" else echo "ukrinform.net page source.txt doesnt exist : $p_source" fi } # $ grep ' #
[ ]\+
# replace
\n\n # echo '
Sunday, March 6
' | sed 's|
$.$, $[a-zA-Z]\{3\}$$.$ $[0-9]\+$
|\4 \2|' # echo '
Russian dictator Vladimir Putin said during a phone call with French President Emmanuel Macron that a meeting with the International Atomic Energy Agency devoted to Ukrainian nuclear facilities may take place outside Ukraine or as part of a video conference. Russia had shelled the Zaporizhzhia Nuclear Power Plant on March 4.
' | grep '
' | sed 's|$.$
$.$
|\2|' # echo '16:08' | grep '' | sed 's|.$.$|\1|' #] remove_redundancy() - # remove_redundancy() # { # # pick up one new paragraph at a time in input file "$1" # grep item in output file "$d_temp"'webNews grep a href paragraph.html' # not in : add to "$d_temp"'webNews grep a href paragraph.html' # in : ignore # sort output file -> "$d_temp"'webNews no redundancy.html' # } #] rm_lineSeq_start_to_end() - make copy of text file, removing all occurrences of a sequence of lines # 08Mar2022 initial # see "$d_bin""fileops.sh" - collection of handy bash scripts for handling file needs #******************************************************************************* # Procedures - # 0. $ bash" section below, uncomment function to run # kyivindependent_news() Scrape (rip) news feeds : # 1. copy-paste "https://kyivindependent.com/news-archive/" one-page-at-a-time source to : # "$d_temp"'kyivindependent.com page source.txt' # 2. edit this bash script, kyivindependent_news() : # change old_date="2022-Mar-14" (something like that) to current date # 3. in "run" section at the bottom of this script file : # uncomment kyivindependent_news, comment out other executables # 4. process with bash script # $ bash "$d_bin"'webPage scrape news.sh' # outputs go to "$d_temp"'kyivindependent.com news articles.txt' # 5. geany regexpr edit of 'kyivindependent.com news articles.txt' # search : ^(2022\-)Mar\- # replace: \103- # 6. remove redundant news items : # manually page down & remove junk : # will write bash script later # 7. copy-paste updated parts to : # "$d_webRawe"'History/Ukraine-Russia/kyivindependent.com news log.html' # ukrinform_news() Scrape (rip) news feeds : # 1. keep scrolling down webPage "https://www.ukrinform.net/block-lastnews" # to last date-time a download was made # Ctrl-A Ctrl-C to [select, copy] the whole webPage # FireFox Menu -> Save page as -> popup window # -> specify ["Web Page complete", "$d_temp"'ukrinform.net page source.html'] # 2. 
edit this bash script, ukrinform_news() : # change old_date="2022-03-14" (something like that) to current date # 3. in "run" section at the bottom of this script file : # uncomment ukrinform_news, comment out other executables # 4. process with bash script # $ bash "$d_bin"'webPage scrape news.sh' # outputs go to "$d_temp"'ukrinform.net news articles.html' # 5. remove redundant news items : # manually page down & remove junk : # will write bash script later # 6. copy-paste updated parts to : # "$d_webRawe"'History/Ukraine-Russia/ukrinform.net news log.html' # ukrinform.net - manual procedure fall-back # ukrinform.net news stream : page source does NOT include what is visible # (eg scroll down to previous day, but can capture only current day-time) # select text-only of news stream # -> copy into this file # -> insert empty lines when no "extra comment" for item # -> select new text # -> manual copy-paste of links (too long to do this!, at least have [title, date, time] and can track later) # geany multi-line regexp : (geany uses + instead of \+) # search : .\n(.)\n(.)\n\n(.)\n # replace : 2022\-03\-08 \1 \2
\n\3
\n
\n #************ # run $ bash "$d_bin"'webPage scrape news.sh' # kyivindependent_news ukrinform_news # remove_redundancy "$d_webRawe"'History/Ukraine-Russia/kyivindependent.com news log.html' # remove_redundancy "$d_webRawe"'History/Ukraine-Russia/ukrinform.net news log.html' # remove_redundancy "$d_webRawe"'History/Ukraine-Russia/news items except [ukrinform, kyivindependent].html' # rm_lineSeq_start_to_end is in "$d_bin""fileops.sh" # rm_lineSeq_start_to_end "$d_webRawe"'History/Ukraine-Russia/' "$d_temp" 'kyivindependent.com news log.html' 'Civilians flee in terror as Ukraine’s military deter Russia in Irpin' 'Gofundme' # enddoc

|\n\n|;s|

||;s| title=\".*\/">|\">|;s|