#!/bin/bash
# bash is required : this script uses `source` and `read -u`, which plain POSIX sh does not provide
#] 
#] *********************
#] $ bash "$d_bin"'webPage scrape news.sh' - j
# www.BillHowell.ca  06Mar2022 initial 
# view in text editor, using constant-width font (eg courier), tabWidth = 3


#************************
# List of operators, generated with :
# $ grep  "^#]"  "$d_bin""webPage scrape news.sh" |  sed "s/^#\]/  /" 
#  *********************
#  $ bash "$d_bin"'webPage scrape news.sh' - j
#  kyivindependent_news()  - extract news items from page source
#  ukrinform_news()  - extract news items from page source
#  remove_redundancy()  - 
#  rm_lineSeq_start_to_end()  - make copy of text file, removing all occurrences of a sequence of lines


#************************
# Setup 

# Load the shared header script. 
# d_bin is expanded before the header is read, so it has to be set already by the calling shell.
# Presumably the header is where d_temp (used by the functions below) gets defined - TODO confirm.
# NOTE(review): `source` is a bash builtin name; the usage notes in this file run the script with `bash`.
source  "$d_bin""standard header.sh"

#************
# code 


#] kyivindependent_news()  - extract news items from page source
# 06Mar2022 initial
	kyivindependent_news()
{
	# Extract news items from a saved copy of the kyivindependent.com news-archive page source.
	#
	# Globals : d_temp (read) - directory prefix; it is concatenated directly with the file names
	# Input   : "$d_temp"'kyivindependent.com page source.txt'
	# Output  : "$d_temp"'kyivindependent.com news articles.html' (overwritten on each run)
	#           if the input file is missing, a message goes to stdout and no output file is written
	#
	# Scan rules, applied line by line :
	#	- a '<div class="new-day">' line (eg "Sunday, March 6") is written as a bold "2022-Mar-6"
	#	  heading and becomes the date prefix of the links that follow
	#	  (the year is hard-coded in the sed script)
	#	- only date headings are written until the first '<article id="post-' line has been seen
	#	- from then on, for every line :
	#		'<div class="clearfix">'          -> spacer (takes priority over the two cases below)
	#		'<a href="https://'               -> "date time link", link cleaned of layout markup
	#		'<span class="post-excerpt"><p>'  -> excerpt text
	#	  the clearfix marker does NOT switch this mode off again : the rest of the file is
	#	  processed the same way
	#
	# 14Mar2022 note that month shortName is extracted, eg "Mar", later convert to number, eg 03
	# Fix : the date heading seen before the first article used sed group \3 (rest of the month
	#	name, eg "ch") instead of \4 (day number), giving "2022-Mar-ch". All headings now use \4.
	#
	# `read` keeps the default IFS on purpose : leading/trailing blanks of each line are trimmed
	# before the sed scripts see it. `-r` stops `read` from eating backslashes in the page source.

	local p_source="$d_temp"'kyivindependent.com page source.txt'
	local p_news="$d_temp"'kyivindependent.com news articles.html'
	local old_date="2022-Mar-14"		# prefix for links that appear before any date heading
	local old_time=""
	local in_articles=""					# becomes non-empty at the first '<article id="post-' line
	local line new_date new_time link content

	if  [ ! -f  "$p_source" ]; then
		echo  "kyivindependent.com page source.txt doesnt exist : $p_source"
		return 0
	fi

	{
		echo ""
		while  read -r -u 9 line; do

			# date heading - handled both before and after the first article
			case "$line" in
				*'<div class="new-day">'*)
					new_date=$(  printf '%s\n' "$line"  |  sed  's|<div class=\"new-day\">\(.*\), \([a-zA-Z]\{3\}\)\(.*\) \([0-9]\+\)</div>|2022-\2-\4|'  )
					if  [ -n "$new_date" ]; then
						echo  "<B>$new_date</b><BR>"
						echo  "<BR>"
						echo  ""
						old_date="$new_date"
					fi
					;;
			esac

			# before the first article : nothing else is extracted from the line
			if  [ -z "$in_articles" ]; then
				case "$line" in
					*'<article id="post-'*)  in_articles="yes" ;;
				esac
				continue
			fi

			# time stamp - remembered for the next link, never written on its own
			case "$line" in
				*'<span class="recent-date">'*)
					new_time=$(  printf '%s\n' "$line"  |  sed  's|.*<span class="recent-date">\(.*\)</span>|\1|'  )
					if  [ -n "$new_time" ]; then
						old_time="$new_time"
					fi
					;;
			esac

			# end-of-article marker : spacer only, even if the line also holds a link or excerpt
			case "$line" in
				*'<div class="clearfix">'*)
					echo  "<BR>"
					echo  ""
					continue
					;;
			esac

			# link line : strip layout markup; linked images reduce to an empty string and are skipped
			link=""
			case "$line" in
				*'<a href="https://'*)
					link=$(  printf '%s\n' "$line"  |  sed \
						-e 's|[ ]\{2,\}||' \
						-e 's| rel=\"nofollow\" target=\"_blank\"||' \
						-e 's|</h3></div><div class="item">[ ]\+<h3 class="entry-title">|<BR>\n\n|' \
						-e 's|<div class=\"post\-thumb\">||' \
						-e 's|<div class=\"item\">||' \
						-e 's|<h3 class=\"entry\-title\">||' \
						-e 's| title=\".*\/">|\">|' \
						-e 's|</h3>||' \
						-e 's|</div>||' \
						-e 's|<a href=.*<img width=.*||'  )
					;;
			esac
			if  [ -n "$link" ]; then
				echo  "$old_date $old_time $link<BR>"
				continue
			fi

			# excerpt line
			content=""
			case "$line" in
				*'<span class="post-excerpt"><p>'*)
					content=$(  printf '%s\n' "$line"  |  sed  's|\(.*\)<span class=\"post\-excerpt\"><p>\(.*\)<\/p>|\2|'  )
					;;
			esac
			if  [ -n "$content" ]; then
				echo  "$content<BR>"
			fi
		done  9< "$p_source"
	}  >"$p_news"
}




#] ukrinform_news()  - extract news items from page source
# 06Mar2022 initial
# 08Mar2022 not handy yet :
#		cannot "save Page As" or "Web developer -> Save as" before current day
#		for now copy-paste from webPage, add URLs (time-consuming!)

	ukrinform_news()
{
	# Extract news items from a saved copy of the ukrinform.net "last news" page.
	#
	# Globals : d_temp (read) - directory prefix; it is concatenated directly with the file names
	# Input   : "$d_temp"'ukrinform.net page source.html'
	# Output  : "$d_temp"'ukrinform.net news articles.html'
	#           it is reset to a single empty line BEFORE the input check, so it is emptied
	#           even when the input file is missing (a message then goes to stdout)
	#
	# Each news item is recognised by a '<time datetime="' line and is read as :
	#	1. the time line      -> "YYYY-MM-DD hh:mm"
	#	2. the next line      -> headline link, rewritten from '<h2><a href="..">..</a></h2>'
	#	3. following lines    -> joined with blanks into one paragraph, up to (not including)
	#	                         the first line containing '</section>'; <p> and </p> become blanks
	#
	# Fix : the paragraph loop used to ignore end-of-file, so a page source that stops before
	#	'</section>' made it spin forever. It now ends at '</section>' OR at end-of-file.
	# Fix : the "doesnt exist" message named "page source.txt" although the file is ".html".
	#
	# `read` keeps the default IFS on purpose : leading/trailing blanks of each line are trimmed.
	# `-r` stops `read` from eating backslashes in the page source.

	local p_source="$d_temp"'ukrinform.net page source.html'
	local p_news="$d_temp"'ukrinform.net news articles.html'
	# old_date is not read anywhere in this function; it is kept only because the procedure
	# notes in this file tell the user to edit it.
	local old_date="2022-03-14"
	local line new_date link paragraph

	echo ""  >"$p_news"
	if  [ ! -f  "$p_source" ]; then
		echo  "ukrinform.net page source.html doesnt exist : $p_source"
		return 0
	fi

	while  read -r -u 9 line; do
		# skip everything up to the next time line
		case "$line" in
			*'<time datetime="'*)  ;;
			*)  continue ;;
		esac
		new_date=$(  printf '%s\n' "$line"  |  sed  's|<time datetime=\"\([0-9-]\+\)T\(.*\):\(.*\):\(.*\)[+].*|\1 \2:\3|'  )
		if  [ -z "$new_date" ]; then
			continue
		fi

		# headline : the line right after the time line (empty if the file ends here)
		read -r -u 9 line
		link=$(  printf '%s\n' "$line"  |  sed  's|.*<h2><a href=\"\(.*\)\">\(.*\)<\/a><\/h2>| <a href=\"\1\">\2<\/a> |'  )
		echo  "$new_date $link<BR>"

		# paragraph : gather lines until '</section>' or end-of-file, whichever comes first
		paragraph=""
		while  read -r -u 9 line; do
			case "$line" in
				*'</section>'*)  break ;;
			esac
			paragraph="$paragraph $line"
		done
		paragraph=$(  printf '%s\n' "$paragraph"  |  sed  's|<p>| |g;s|</p>| |g'  )
		echo  "$paragraph<BR>"
		echo  "<BR>"
		echo  ""
	done  9< "$p_source"  >>"$p_news"
}

# $ grep  '<a href="'  "$d_temp"'kyivindependent.com page source.txt'
# don't occur in <a href= line of articles : 
# 		<h3 class=\"entry\-title\">
#		<img .*<a href=
# does occur for linked images - remove these
#		<a href=.*<img width=.*
#		
#		

# special problems : 
#		<a href=.*<a href=			occurs in two articles of test file
#			1. search  </h3></div><div class="item">[ ]\+<h3 class="entry-title">
#				replace <BR>\n\n

# echo  '<div class="new-day">Sunday, March 6</div>'  |  sed  's|<div class=\"new-day\">\(.*\), \([a-zA-Z]\{3\}\)\(.*\) \([0-9]\+\)</div>|\4 \2|'

# echo  '     <span class="post-excerpt"><p>Russian dictator Vladimir Putin said during a phone call with French President Emmanuel Macron that a meeting with the International Atomic Energy Agency devoted to Ukrainian nuclear facilities may take place outside Ukraine or as part of a video conference. Russia had shelled the Zaporizhzhia Nuclear Power Plant on March 4.</p>'  |  grep '<span class=\"post\-excerpt\"><p>'  |  sed  's|\(.*\)<span class=\"post\-excerpt\"><p>\(.*\)</p>|\2|'

# echo '<!--      --><span class="recent-date">16:08</span>'  |  grep '<span class="recent-date">'  |  sed  's|.*<span class="recent-date">\(.*\)</span>|\1|' 



#] remove_redundancy()  - 

# 	remove_redundancy()
# {	
# 	
# 	pick up one new paragraph at a time in input file "$1" 
# 		grep item in output file	"$d_temp"'webNews grep a href paragraph.html'
# 		not in : add to  				"$d_temp"'webNews grep a href paragraph.html' 
# 		in		 : ignore
# 	sort output file -> 				"$d_temp"'webNews no redundancy.html'
# }  


#] rm_lineSeq_start_to_end()  - make copy of text file, removing all occurrences of a sequence of lines
# 08Mar2022 initial
# see "$d_bin""fileops.sh" - collection of handy bash scripts for handling file needs



#*******************************************************************************
# Procedures -  

# 0.  in the "run $ bash" section at the bottom of this file, uncomment the function to run 

#	kyivindependent_news() Scrape (rip) news feeds :
#	1. copy-paste "https://kyivindependent.com/news-archive/" one-page-at-a-time source to : 
#			"$d_temp"'kyivindependent.com page source.txt'
#	2. edit this bash script, kyivindependent_news() :
#			change old_date="2022-Mar-14" (something like that) to current date
#	3.	in "run" section at the bottom of this script file :
#			uncomment kyivindependent_news,  comment out other executables
#	4. process with bash script 	
#			$ bash  "$d_bin"'webPage scrape news.sh'
#			outputs go to "$d_temp"'kyivindependent.com news articles.html'
#	5. geany regexpr edit of 'kyivindependent.com news articles.html'
#			search :	^(2022\-)Mar\-<no endSpace>
#			replace:	\103-<no endSpace>
#	6. remove redundant news items :
#			manually page down & remove junk : 
#			will write bash script later
#	7. copy-paste updated parts to :
#			"$d_webRawe"'History/Ukraine-Russia/kyivindependent.com news log.html'


#	ukrinform_news() Scrape (rip) news feeds :
#	1. keep scrolling down webPage "https://www.ukrinform.net/block-lastnews" 
#			to last date-time a download was made
#			Ctrl-A Ctrl-C to [select, copy] the whole webPage
#			FireFox Menu ->  Save page as -> popup window 
#				-> specify ["Web Page complete", "$d_temp"'ukrinform.net page source.html']
#	2. edit this bash script, ukrinform_news() :
#			change old_date="2022-03-14" (something like that) to current date
#	3.	in "run" section at the bottom of this script file :
#			uncomment ukrinform_news,  comment out other executables
#	4. process with bash script 
#			$ bash  "$d_bin"'webPage scrape news.sh'
#			outputs go to "$d_temp"'ukrinform.net news articles.html'
#	5. remove redundant news items :
#			manually page down & remove junk : 
#			will write bash script later
#	6. copy-paste updated parts to :
#			"$d_webRawe"'History/Ukraine-Russia/ukrinform.net news log.html' 


# ukrinform.net - manual procedure fall-back
#	ukrinform.net news stream : page source does NOT include what is visible 
#		(eg scroll down to previous day, but can capture only current day-time)
#	select text-only of news stream 
#		-> copy into this file 
#		-> insert empty lines when no "extra comment" for item
#		-> select new text
#		-> manual copy-paste of links (too long to do this! at least have [title, date, time] and can track later)
#	geany multi-line regexp :  (geany uses + instead of \+)
#		search	: .*\n(.*)\n(.*)\n\n(.*)\n
#		replace	: 2022\-03\-08 \1 <A HREF="">\2</a><BR>\n\3<BR>\n<BR>\n


#************
# run $ bash  "$d_bin"'webPage scrape news.sh'  

# One scraper is active at a time : uncomment the function to run and comment out the other
# (see the "Procedures" notes in this file).
#	kyivindependent_news
	ukrinform_news

#	remove_redundancy  "$d_webRawe"'History/Ukraine-Russia/kyivindependent.com news log.html'
#	remove_redundancy  "$d_webRawe"'History/Ukraine-Russia/ukrinform.net news log.html'
#	remove_redundancy  "$d_webRawe"'History/Ukraine-Russia/news items except [ukrinform, kyivindependent].html'

#	rm_lineSeq_start_to_end is in "$d_bin""fileops.sh" 
#	rm_lineSeq_start_to_end  "$d_webRawe"'History/Ukraine-Russia/'  "$d_temp"  'kyivindependent.com news log.html'      'Civilians flee in terror as Ukraine’s military deter Russia in Irpin'  'Gofundme'  





# enddoc
