diff --git a/generate.py b/generate.py
index aaf1a2a..ced1fb4 100755
--- a/generate.py
+++ b/generate.py
@@ -8,23 +8,44 @@
 THIS IS A WORK IN PROGRESS; the HTML on the LoC pages this scrapes
 is irregular.  Most of the work needs to be done on subfield
 parsing.
+
 You'll need to have python, lxml.etree, pyquery, and requests installed
-before running this script.
+before running this script. Developed in Python 2.7.3

 Questions or comments welcome in #code4lib on
-irc.freenode.net.
+irc.freenode.net. Adam Constabaris (ajconsta) is to blame.

 Inspired by the original by Ed Summers.
+
+The MIT License (MIT)
+
+Copyright (c)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
 """

 import requests
+from itertools import groupby
 from pyquery.pyquery import PyQuery
 from collections import OrderedDict
-from urlparse import urljoin
-import hashlib
-import re
-import os, sys
-import json
+from urlparse import urljoin, urlparse
+import re, os, sys, json

 def normalize(input_string):
     return input_string and re.sub(r"\s+"," ",input_string).strip() or input_string
@@ -34,6 +55,14 @@ def two_split(input_string,delimiter='-'):
     """splits a string into at most two parts and normalizes each part"""
     return [ normalize(x) for x in input_string.split('-',1) ]

+def lister(groupie):
+    """Need something a bit more sophisticated than list(itertools._grouper) because
+    the "values" (which are generatory) get consumed when you do that."""
+    rv = []
+    for k, v in groupie:
+        rv.append((k, list(v)))
+    return rv
+
 class Crawler(object):
     """Crawls the concise MARC info pages on the LoC site.

@@ -42,29 +71,34 @@ class Crawler(object):
     you'd like.
     """

+    start_url = 'http://www.loc.gov/marc/bibliographic/ecbdhome.html'
+
     CONTROL_FIELDS = set(('001','003', '005', '006', '007', '008'))
+
     # tag, name, repeatability
     field_basics_re = re.compile(r"^\s*([0-9]{3})\s*-\s*(.*)\s+\((N?R)\)$")
-    subfield_re = re.compile(r"^\s*\$(.-?.?)\s+-\s+(.*)(?:\s*\((N?R)\))?$")
+    subfield_re = re.compile(r"^\s*\$(.-?.?)\s+-\s+([^(]+)\s*(?:\((N?R)\))?$")
+    #re.compile(r"^\s*\$(.-?.?)\s+-\s+(.*)\s*(\((N?R)\))?$")

-    def __init__(self,cacher=None):
+    def __init__(self,cacher=None, start_url=None):
         """
         Creates a new instance.

         If no cacher is specified, builds a default instance.

         @param cacher a page caching fetcher.
""" if cacher is None: cacher = Cacher() - self.cacher = cacher - self.start_url = 'http://www.loc.gov/marc/bibliographic/ecbdhome.html' + self.cacher = cacher + if start_url is not None: + self.start_url = start_url def __iter__(self): urls = self.get_bibliographic_urls() for url in urls: links = self.get_concise_pages(url) for link in links: - td = self.get_tag_data(link) + td = self.get_field_data(link) if td: yield td @@ -83,7 +117,7 @@ def extract_title_text(self,h1): return m.group(1), m.group(2), m.group(3) == 'R' return False - def get_tag_data(self,url): + def get_field_data(self,url): """ Fetches the data from the URL and tries to extract all of the tag information from the page. @@ -94,7 +128,7 @@ def get_tag_data(self,url): or False if information cannot be extracted from the page at url """ dom = self.get_dom(url) - tag_info = self.get_field_def(dom) + tag_info = self.get_tag_def(dom) if tag_info: tag, title, repeatable = tag_info else: @@ -104,7 +138,10 @@ def get_tag_data(self,url): definition = dom("p").eq(0) if not definition.size(): definition = PyQuery("

Bad HTML: %s

" % url) - if tag not in self.CONTROL_FIELDS: + control_field = tag in self.CONTROL_FIELDS + definition = normalize(definition.text()) + data = dict(title=title,definition=definition,repeatable=repeatable,control_field=control_field) + if not control_field: subfields = self.get_subfields(dom) if '?' in subfields: raise Exception("can't parse subfields in " + url) @@ -115,32 +152,33 @@ def get_tag_data(self,url): traceback.print_exception(*sys.exc_info()) print e raise Exception("Can't get indicators from " + url, e) - else: - subfields = () - indicators = () - - return tag, dict(title=title,definition=normalize(definition.text()),repeatable=repeatable, subfields=subfields,indicators=indicators) - - + data['indicators'] = indicators + data['subfields'] = subfields + return tag, data + def get_subfields(self,dom): rv = OrderedDict() values = dom("body > div.subfieldvalue") - def handler(idx,pel): - pel = PyQuery(pel) - txt = re.sub(r"\n+", " ", pel.text()) + if values.size() == 0: + return self._subfield_dl(dom) + + def handler(idx,el): + pel = PyQuery(el) + txt = normalize(el.text) m = self.subfield_re.match(txt) if m: - sf = m.group(1).strip() - desc = m.group(2).strip() + sf = normalize(m.group(1)) + defn = normalize(m.group(2)) if len(m.groups()) > 2: repeatability = m.group(3) == 'R' else: repeatability = None else: + sys.stderr.write(unicode(pel)) sys.stderr.write("<<<" + txt + ">>>") - sf,desc,repeatability = "?", "?", False + sf,defn,repeatability = "?", "?", False extra = [ x.text for x in pel("div.description") if x.text is not None ] - rv[sf] = dict(description=desc,definition=len(extra) > 0 and normalize(" ".join(extra)) or "") + rv[sf] = dict(definition=defn,description=len(extra) > 0 and normalize(" ".join(extra)) or "") if repeatability is not None: rv[sf]['repeatable'] = repeatability else: @@ -159,27 +197,39 @@ def parse_indicator(self,dom): desc = dom.eq(0)("div.description") desc = desc.size() > 0 and normalize(desc.eq(0).text()) or "" return dict(definition=definition,values=values,description=desc) + + def _subfield_dl(self,dom): + defs = dom("div.subfields dl > dt") + sfs = OrderedDict() + for d in defs: + m = self.subfield_re.match(d.text) + if m: + sf = normalize(m.group(1)) + defn = normalize(m.group(2)) + rpt = m.group(3) == 'R' + sfs[sf] = OrderedDict(definition=defn,repeatable=rpt) + return sfs + + + def _indicator_dl(self,dom): - """024 puts indicators in a nicer, but still outlier, HTML + """Some 'newer' pages put indicators in a somewhat more structured HTML definition list""" dl = dom("div.indicators dl") + groups = lister(groupby(dl[0], lambda x : x.tag == 'dt' and 'def' or 'values')) + # now looks like [ ('def',(def element)), ('values', (value elements)) ] x 2 inds = [] - values = OrderedDict() - definition = "" - for el in dl: - if el.tag == 'dt': - if values: - inds.append(dict(definition=definition,values=values)) - values = OrderedDict() - definition = el.text.split("-")[1].strip() - elif el.tag == 'dd': - k,v = two_split(el.text) - values[k] = v - if len(inds) < 2: - inds.append(dict(definition=definition,values=values) ) + for i in (0,2): + defn = groups[i][1][0].text + vals = groups[i+1][1] + inds.append(OrderedDict( + definition=two_split(defn)[1], + values= OrderedDict([ two_split(x.text) for x in vals]) + ) + ) return inds - + def get_indicators(self,dom): tli = dom("body div.indicatorvalue") if tli.size() >= 2: @@ -190,7 +240,7 @@ def get_indicators(self,dom): return self._indicator_dl(dom) return ({},{}) - def get_field_def(self,dom): + def 
         h1 = dom("h1")
         if not h1:
             sys.stderr.write("oops %s " % url)
@@ -246,8 +296,8 @@ def fetch_text(self,url):
         latter, creates a file in self.cache_dir using the md5sum of the URL
         @param url -- the URL to fetch
         """
-        digest = hashlib.md5(url).hexdigest()
-        pth = os.path.join(self.cache_dir, digest + ".html")
+        filename = urlparse(url).path.replace("/", "_")
+        pth = os.path.join(self.cache_dir, filename)
         if os.path.isfile(pth) and os.path.getsize(pth) > 0:
             with open(pth) as thefile:
                 return thefile.read()
@@ -263,6 +313,11 @@ def fetch_text(self,url):
             output.write(bytes)
         return bytes

+class CacherNoCaching(Cacher):
+    """For when only fresh data will do!"""
+    def fetch_text(self,url):
+        return requests.get(url).text
+
 # ok start your engines
 if __name__ == '__main__':
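
The patch's own __main__ block is truncated above. For orientation only, here is a minimal sketch (not part of the diff) of how the Crawler and Cacher classes it defines might be driven to dump the scraped field data as JSON; build_field_dict, the fresh flag, importing from generate, and writing to stdout are illustrative assumptions, not code from generate.py.

# Hypothetical driver -- a sketch only, not the actual contents of generate.py's
# __main__ block. Assumes generate.py is importable as a module (it guards its
# own entry point with __main__).
import json
import sys
from collections import OrderedDict

from generate import Cacher, CacherNoCaching, Crawler

def build_field_dict(fresh=False):
    # The default Cacher keeps fetched pages on disk in its cache_dir, so repeated
    # runs do not re-fetch from loc.gov; CacherNoCaching re-fetches every page.
    cacher = CacherNoCaching() if fresh else Cacher()
    fields = OrderedDict()
    # Crawler.__iter__ yields (tag, data) pairs, one per concise field page.
    for tag, data in Crawler(cacher=cacher):
        fields[tag] = data
    return fields

if __name__ == '__main__':
    json.dump(build_field_dict(), sys.stdout, indent=2)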