Skip to content

Commit

Permalink
Make committee IDs distinct, add divnumber and trim changedates.txt
Browse files Browse the repository at this point in the history
git-svn-id: http://project.knowledgeforge.net/ukparse/svn/trunk@10715 9cc54934-f9f6-0310-8a8d-e8e977684441
  • Loading branch information
mark committed Oct 18, 2013
1 parent 0eb91ce commit bd4efc5
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pyscraper/sp/parse-official-reports-new.py
Original file line number Diff line number Diff line change
@@ -366,7 +366,7 @@ def suggested_file_name(self):
return "%s/%s_%d.xml" % (self.normalized_session_name, self.report_date, self.page_id)

def as_xml(self):
- base_id = "uk.org.publicwhip/spor2/"
+ base_id = "uk.org.publicwhip/spor/"
if self.normalized_session_name not in ('plenary', 'meeting-of-the-parliament'):
base_id += self.normalized_session_name + "/"
base_id += str(self.report_date)
@@ -429,9 +429,10 @@ def tidy_speeches(self):

class Division(object):

- def __init__(self, report_date, url, candidate=None, candidate_id=None):
+ def __init__(self, report_date, url, divnumber, candidate=None, candidate_id=None):
self.report_date = report_date
self.url = url
+ self.divnumber = divnumber
self.votes = {}
self.candidate = candidate
self.candidate_id = candidate_id
@@ -453,6 +454,7 @@ def as_xml(self, division_id):
attributes = {'url': self.url,
'divdate': str(self.report_date),
'nospeaker': "True",
+ 'divnumber': str(self.divnumber),
'id': division_id}
if self.candidate:
attributes['candidate'] = self.candidate
@@ -543,6 +545,7 @@ def quick_parse_html(filename, page_id, original_url):
return (session, report_date, soup)

def parse_html(session, report_date, soup, page_id, original_url):
+ divnumber = 0
report_view = soup.find('div', attrs={'id': 'ReportView'})
div_children_of_report_view = report_view.findChildren('div', recursive=False)
if len(div_children_of_report_view) != 1:
@@ -652,7 +655,8 @@ def parse_html(session, report_date, soup, page_id, original_url):
parsed_page.sections[-1].speeches_and_votes.append(current_speech)
current_speech.paragraphs.append(tidied_paragraph)
if (not current_votes) or (current_votes.candidate != division_candidate):
- current_votes = Division(report_date, current_url, division_candidate, division_candidate_id)
+ current_votes = Division(report_date, current_url, divnumber, division_candidate, division_candidate_id)
+ divnumber += 1
parsed_page.sections[-1].speeches_and_votes.append(current_votes)
current_division_way = division_way
elif member_vote:
Expand Down

0 comments on commit bd4efc5

Please sign in to comment.