diff --git a/lib/ag/scraping.rb b/lib/ag/scraping.rb
index e0c49dc..dc31b1e 100644
--- a/lib/ag/scraping.rb
+++ b/lib/ag/scraping.rb
@@ -74,133 +74,55 @@ def validate_programs(programs)
exit
end
programs.delete_if do |program|
- program.title == '放送休止'
+ program.title == '番組休止中' || program.title == '放送休止'
end
end
-
def scraping_page
- res = HTTParty.get('http://www.agqr.jp/timetable/streaming.html')
+ target_date = Date.today.next_day
+ res = HTTParty.get("https://www.joqr.co.jp/qr/agdailyprogram/?date=#{target_date.strftime('%Y%m%d')}")
dom = Nokogiri::HTML.parse(res.body)
- tbody = dom.css('.timetb-ag tbody') # may be 30minutes belt
- td_list_list = parse_broken_table(tbody)
- two_dim_array = table_to_two_dim_array(td_list_list)
- day_time_array = join_box_program(transpose(two_dim_array))
- day_time_array.each_with_index.inject([]) do |programs, (programs_day, index)|
- programs + parse_day(programs_day, index)
- end
- end
-
- def parse_broken_table(tbody)
- # time table HTML is broken!!!!!! some row aren't opened by
.
- td_list_list = []
- td_list_tmp = []
- tbody.children.each do |tag|
- if tag.name == 'td'
- td_list_tmp.push tag
- elsif tag.name == 'tr' || tag.name == 'th'
- unless td_list_tmp.empty?
- td_list_list.push td_list_tmp
- td_list_tmp = []
- end
- if tag.name == 'tr'
- td_list_list.push tag.css('td')
- end
- end
- end
- unless td_list_tmp.empty?
- td_list_list.push td_list_tmp
- end
- td_list_list
- end
-
- def parse_day(programs_day, index)
- wday = (index + 1) % 7 # monday start
- programs_day.map do |td|
- parse_td_dom(td, wday)
- end
- end
-
- def table_to_two_dim_array(td_list_list)
- aa = []
- span = {}
- td_list_list.each_with_index do |td_list, row_n|
- a = []
- col_n = 0
- td_list.each do |td|
- while span[[row_n, col_n]]
- a.push(nil)
- col_n += 1
- end
- a.push(td)
- cspan = 1
- if td['colspan'] =~ /(\d+)/
- cspan = $1.to_i
- end
- rspan = 1
- if td['rowspan'] =~ /(\d+)/
- rspan = $1.to_i
- end
- (row_n...(row_n + rspan)).each do |r|
- (col_n...(col_n + cspan)).each do |c|
- span[[r, c]] = true
- end
- end
- col_n += 1
- end
- aa.push(a)
- end
- aa
+ program_items = dom.css('.dailyProgram-itemBox')
+ parse_program(program_items, target_date.wday)
end
- def transpose(two_dim_array)
- max_size = two_dim_array.max_by{|i| i.size }.size
- filled = two_dim_array.map{|i| i.fill(nil, i.size...max_size) }
- filled.transpose
- end
-
- def join_box_program(day_time_array)
- day_time_array.map do |day|
- day.inject([]) do |programs, td|
- unless td
- next programs
- end
- time = td.css('.time')[0].text
- if time.include?('頃')
- programs.last['rowspan'] = programs.last['rowspan'].to_i + td['rowspan'].to_i
- next programs
- end
- programs << td
- end
+ def parse_program(program_items, wday)
+ program_items.map do |item|
+ start_time = parse_start_time(item, wday)
+ minutes = parse_minutes(item, wday)
+ title = parse_title(item)
+ Program.new(start_time, minutes, title);
end
end
- def parse_td_dom(td, wday)
- start_time = parse_start_time(td, wday)
- minutes = parse_minutes(td)
- title = parse_title(td)
- Program.new(start_time, minutes, title)
+ def parse_minutes(item, wday)
+ header_time = item.css('.dailyProgram-itemHeaderTime').text.strip
+ start_time, end_time = parse_header_time(header_time)
+ s = ProgramTime.parse(wday, start_time).next_on_air
+ e = ProgramTime.parse(wday, end_time).next_on_air
+ (e - s).floor / 60
end
- def parse_minutes(td)
- rowspan = td.attribute('rowspan')
- if !rowspan || rowspan.value.blank?
- 30
- else
- td.attribute('rowspan').value.to_i
- end
+ def parse_header_time(header_time)
+ header_time.scan(/([0-9]+:[0-9]+) .+ ([0-9]+:[0-9]+)/).first
end
- def parse_start_time(td, wday)
- ProgramTime.parse(wday, td.css('.time')[0].text)
+ def parse_start_time(item, wday)
+ header_time = item.css('.dailyProgram-itemHeaderTime').text.strip
+ start_time, _ = parse_header_time(header_time)
+ ProgramTime.parse(wday, start_time)
end
- def parse_title(td)
- [td.css('.title-p')[0].text, td.css('.rp')[0].text].select do |text|
- !text.gsub(/\s/, '').empty?
- end.map do |text|
- Moji.normalize_zen_han(text).strip
- end.join(' ')
+ def parse_title(item)
+ title = item.css('.dailyProgram-itemTitle').text.strip
+ personality = item
+ .css('.dailyProgram-itemPersonality')
+ .text
+ .strip
+ .gsub(' ', '')
+ .gsub(',', '_')
+ title += "_#{personality}" unless personality.empty?
+ Moji.normalize_zen_han(title)
end
end
end