February 28, 2012
LOM OAI-PMH Harvester in Python
import oaipmh
import os
import re
import sys
import string
import time
import httplib
import urllib
import libxml2
from read_config import read_config
import do_sql
#from read_file import read_file
from MySQLdb.cursors import DictCursor
#from split import split
from string import split
import ConfigParser
import lom_handlers
import normalize_harvest

debug = 1
harvest = {}
repository_definitions_map = read_config('repository_definitions.ini', debug)
time.sleep(5)

# Namespaces to be stripped out from the about section and replaced
# with a single default one, ie. oaiabout:
namespace_list = ['rdn_dc','dcterms','dc','meg','rdnterms']
identifier_list = []

try:
    repository_handle = sys.argv[1]
    harvest_type = sys.argv[2]
except IndexError:
    sys.exit("\nUsage:\n\npython %s <repository_handle> <harvest_type>\n" % sys.argv[0])
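# For reference, read_config() above is expected to hand back a mapping from
# repository handle to a dict carrying the 'repository_domain' and 'uri_path'
# keys used later on. A hypothetical repository_definitions.ini entry -- the
# section name and values are illustrative, not from a real config:
#
#   [some_repository]
#   repository_domain = repository.example.org
#   uri_path = /oai-pmh/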
def get_last_harvest_date(repository_handle):
    sql = "select max(date_format(harvest_date,'%Y-%m-%dT%H:%i:%sZ')) as last_harvest_date from harvests where harvest_type = 'daily' and repository_handle = '" + repository_handle + "'"
    response = do_sql.do_sql_query('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    results = response['results']
    return results[0]['last_harvest_date']

def get_xpath_map():
    sql = 'select * from xpath_map'
    response = do_sql.do_sql_query('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    results = response['results']
    return results

def do_xpath_query(xml, xpath, label, identifier):
    # We need to strip out namespaces from the about section
    # for our xpath to work, as we will be adding ones back in
    for ns in namespace_list:
        xml = string.replace(xml, ns + ':', '')
        label = string.replace(label, ns + ':', '')
        xpath = string.replace(xpath, ns + ':', 'oaiabout:')
    xpath = string.replace(xpath, '//about', '//oaiabout:about')
    final_dict = {}
    parsed_doc = libxml2.parseDoc(xml)
    if string.find(xpath, 'oaiabout') == -1:
        #xpath_elements = split('/', xpath)
        xpath_elements = split(xpath, '/')
        xpath = ''
        del xpath_elements[0]
        del xpath_elements[0]
        nspace = '/lom:'
        for el in xpath_elements:
            xpath = xpath + nspace + el
        xpath = '/' + xpath
        xpath = string.replace(xpath, 'lom:@', '@')
        xpath = string.replace(xpath, 'about:@', '@')
    if debug == 1:
        print xpath
    ctx = parsed_doc.xpathNewContext()
    ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")
    ctx.xpathRegisterNs("oaiabout", "http://www.openarchives.org/OAI/2.0/")
    final_dict[label] = [node.content for node in ctx.xpathEval(xpath)]
    parsed_doc.freeDoc()
    if debug == 1:
        print final_dict
    return final_dict

def classification_parser(xml, debug):
    # Special parser for the horribly nested classification section.
    # In order to get the proper nesting we need three passes:
    # 1) Grab the classification nodes
    # 2) For each classification node parse the taxonPaths
    # 3) For each taxonPath parse out the taxons
    parsed_doc = libxml2.parseDoc(xml)
    ctx = parsed_doc.xpathNewContext()
    ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")
    ctx.xpathRegisterNs("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    xpath = "//lom:lom/lom:classification"
    class_list = [node for node in ctx.xpathEval(xpath)]
    count = 0
    node_values = []
    for snode in class_list:
        if debug == 1:
            print snode.serialize()
        node_dict = {}
        new_doc = libxml2.parseDoc(snode.serialize())
        ctx = new_doc.xpathNewContext()
        xpath = "//classification/purpose/source"
        node_dict['classification_purpose_source'] = [node.content for node in ctx.xpathEval(xpath)]
        xpath = "//classification/purpose/value"
        node_dict['classification_purpose_value'] = [node.content for node in ctx.xpathEval(xpath)]
        xpath = "//classification/taxonPath"
        # Serialize the node back into XML for parsing once again :)
        xml_path_list = [node.serialize() for node in ctx.xpathEval(xpath)]
        node_dict['classification_taxonPaths'] = []
        for xml_path in xml_path_list:
            path = {}
            new_new_doc = libxml2.parseDoc(xml_path)
            ctx = new_new_doc.xpathNewContext()
            xpath = "//taxonPath/source/string"
            path['source_string'] = [node.content for node in ctx.xpathEval(xpath)]
            xpath = "//taxonPath/taxon/id"
            path['taxon_id'] = [node.content for node in ctx.xpathEval(xpath)]
            xpath = "//taxonPath/taxon/entry/string"
            path['taxon_entry_string'] = [node.content for node in ctx.xpathEval(xpath)]
            node_dict['classification_taxonPaths'].append(path)
        node_values.append(node_dict)
        count = count + 1
    return node_values
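# For illustration, this is the nested shape classification_parser() walks,
# pieced together from the XPaths above (the element values are made up):
#
#   <classification>
#     <purpose>
#       <source>LOMv1.0</source>
#       <value>discipline</value>
#     </purpose>
#     <taxonPath>
#       <source><string>Example taxonomy</string></source>
#       <taxon>
#         <id>123</id>
#         <entry><string>Example subject heading</string></entry>
#       </taxon>
#     </taxonPath>
#   </classification>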
if harvest_type == 'daily':
    harvest_from = get_last_harvest_date(repository_handle)
else:
    harvest_from = '2003-01-01T00:00:00Z'

lom_handlers.clear_tables(debug)

print "\n##Going to harvest the %s repository##\n" % repository_handle

try:
    repository_definition = repository_definitions_map[repository_handle]
except KeyError:
    sys.exit("Sorry there's no such repository definition as " + repository_handle + " in repository_definitions.ini:\n" + str(repository_definitions_map.keys()))

repository_domain = repository_definition['repository_domain']
uri_path = repository_definition['uri_path']
repository_uri = "http://%s%s" % (repository_domain, uri_path)

sesh = oaipmh.ServerProxy(repository_uri)
server_id = sesh.identify()
print server_id.repositoryName()
print server_id.adminEmails()

metadata_formats = sesh.listMetadataFormats()
if debug == 1:
    for format in metadata_formats:
        print format

all_identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_='2003-01-01T00:00:00Z')
all_ids_list = []
for header in all_identifiers:
    this_id = header.identifier()
    all_ids_list.append(this_id)

identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_=harvest_from)

xpath_map = get_xpath_map()
print xpath_map
#sys.exit  # leftover debug exit -- as written it did nothing, so it is commented out

xpath_query_list = {}
for map in xpath_map:
    xpath_query_list[map['xpath_query']] = 1

for header in identifiers:
    this_record = {}
    print "\n\n"
    this_id = header.identifier()
    identifier_list.append(this_id)
    print "\n %s" % this_id
    time.sleep(2)
    #sesh = httplib.HTTPConnection(repository_domain)
    #sesh.connect()
    #sesh.request('GET',uri_path + '?verb=GetRecord&metadataPrefix=lom&identifier=' + this_id)
    #response = sesh.getresponse()
    response = urllib.urlopen(repository_uri + '?verb=GetRecord&metadataPrefix=lom&identifier=' + this_id)
    xml = response.read()
    # Fix for Nik's dodgy classification section
    xml = string.replace(xml, 'taxonpath', 'taxonPath')
    neat_xml = lom_handlers.mysql_tidy(xml)
    reply = do_xpath_query(xml, '//lom/general/title/string', 'main_title', this_id)
    main_title = lom_handlers.mysql_tidy(reply['main_title'][0])
    sql = "insert into xml_store(oai_identifier,xml_document,record_main_title) values('%s','%s','%s')" % (this_id, neat_xml, main_title)
    if debug == 1:
        print sql
    response = do_sql.do_sql('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    for query in xpath_query_list.keys():
        label = query
        label = string.replace(label, '//', '')
        label = string.replace(label, 'oaiabout:', '')
        label = string.replace(label, '/', '_')
        label = string.replace(label, '@', '_')
        reply = do_xpath_query(xml, query, label, this_id)
        for ns in namespace_list:
            label = string.replace(label, ns + ':', '')
        this_record[label] = reply[label]
    if debug == 1:
        print "\nThis record:\n", this_record
        time.sleep(5)
    lom_resource_id = lom_handlers.add_main_title(this_id, debug, this_record)
    lom_handlers.add_title_added_entries(lom_resource_id, debug, this_record)
    lom_handlers.add_keywords(lom_resource_id, debug, this_record)
    lom_handlers.add_ISBNs(lom_resource_id, debug, this_record)
    lom_handlers.store_poi(lom_resource_id, debug, this_record)
    lom_handlers.lom_extra_languages(lom_resource_id, debug, this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_description_string', 'general_description', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_coverage_string', 'general_coverage', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_technical_otherPlatformRequirements_string', 'technical_otherPlatformRequirements', this_record)
    lom_handlers.add_authors(lom_resource_id, debug, this_record)
    lom_handlers.add_about(lom_resource_id, debug, this_record)
    lom_handlers.add_publishers(lom_resource_id, debug, this_record)
    lom_handlers.add_locations(lom_resource_id, debug, this_record)
    lom_handlers.add_formats(lom_resource_id, debug, this_record)
    lom_handlers.add_lr_types(lom_resource_id, debug, this_record)
    lom_handlers.add_educational_contexts(lom_resource_id, debug, this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_educational_description_string', 'educational_description', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_rights_description_string', 'rights_description', this_record)
    if this_record['lom_rights_copyrightAndOtherRestrictions_value']:
        if this_record['lom_rights_copyrightAndOtherRestrictions_value'][0] in ('Yes', 'yes'):
            lom_handlers.add_set_value(lom_resource_id, debug, 1, 'rights_restricted')
    lom_handlers.add_relations(lom_resource_id, debug, this_record)
    class_list = classification_parser(xml, debug)
    this_record['lom_classification'] = class_list
    harvest[this_id] = this_record
    if debug == 1:
        print class_list
        time.sleep(3)
    lom_handlers.add_classification(lom_resource_id, debug, class_list)
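# A worked example of the query-to-label munging inside the harvest loop
# above, assuming an xpath_map row holds a query like the one used for the
# general description (the input value here is illustrative):
#
#   query = '//lom/general/description/string'
#   strip '//'               -> 'lom/general/description/string'
#   '/' and '@' become '_'   -> 'lom_general_description_string'
#
# which is the this_record key that handlers such as add_general_value()
# then look up.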
# Put the entry in the harvests logging table
harvest_identifiers = string.join(identifier_list, '::')
all_ids = string.join(all_ids_list, '::')
if harvest_type != 'daily':
    harvest_type = 'complete'
sql = "insert into harvests(harvest_identifiers, all_identifiers, harvest_dictionary, harvest_type, repository_handle) values('" + harvest_identifiers + "','" + all_ids + "','" + lom_handlers.mysql_tidy(str(harvest)) + "','" + harvest_type + "','" + repository_handle + "')"
response = do_sql.do_sql('ltsn01_harvester', sql, debug)
do_sql.check_status(response, debug)

normalize_harvest.normalize_data(debug)
normalize_harvest.handle_deletions(repository_handle, debug)
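The script takes the repository handle and harvest type straight from sys.argv, so a run looks something like this (the script filename and handle here are placeholders):

python lom_harvester.py some_repository daily

Any harvest type other than 'daily' harvests everything from 2003-01-01 onwards and gets logged as a 'complete' harvest.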
Posted by pj at February 28, 2012 02:54 PM