February 28, 2012
LOM OAI-PMH Harvester in Python
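A harvester script that pulls IEEE LOM metadata records from an OAI-PMH repository into MySQL. It lists record identifiers with the oaipmh library, fetches each record's LOM XML via a GetRecord request, stores the raw document in an xml_store table, then extracts individual fields with libxml2 XPath queries driven by an xpath_map table. Per-repository connection details come from repository_definitions.ini.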
import oaipmh
import os
import re
import sys
import string
import time
import httplib
import urllib
import libxml2
from read_config import read_config
import do_sql
#from read_file import read_file
from MySQLdb.cursors import DictCursor
#from split import split
from string import split
import ConfigParser
import lom_handlers
import normalize_harvest
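# oaipmh handles the Identify/ListMetadataFormats/ListIdentifiers calls;
# the per-record LOM GetRecord requests go out as plain urllib GETs so the
# raw XML can be parsed with libxml2. read_config, do_sql, lom_handlers and
# normalize_harvest are this project's own helper modules.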
debug = 1
harvest = {}
repository_definitions_map = read_config('repository_definitions.ini', debug)
time.sleep(5)
# Namespaces to be stripped out from the about section and replaced with a single default one
# ie. oaiabout:
namespace_list = ['rdn_dc','dcterms','dc','meg','rdnterms']
identifier_list = []
harvest_type = ''
try:
    repository_handle = sys.argv[1]
    harvest_type = sys.argv[2]
except IndexError:
    sys.exit("\nUsage:\n\npython %s <repository_handle> <harvest_type>\n" % sys.argv[0])
def get_last_harvest_date(repository_handle):
    sql = "select max(date_format(harvest_date,'%Y-%m-%dT%H:%i:%sZ')) as last_harvest_date from harvests where harvest_type = 'daily' and repository_handle = '" + repository_handle + "'"
    response = do_sql.do_sql_query('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    results = response['results']
    return results[0]['last_harvest_date']
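# get_last_harvest_date comes back pre-formatted as an OAI-PMH datestamp
# (e.g. 2012-02-27T14:30:00Z), so it can be passed straight into the from_
# argument of listIdentifiers for a daily run.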
def get_xpath_map():
    sql = 'select * from xpath_map'
    response = do_sql.do_sql_query('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    results = response['results']
    return results
def do_xpath_query(xml, xpath, label, identifier):
    # We need to strip out namespaces from the about section
    # for our xpath to work, as we will be adding ones back in
    for ns in namespace_list:
        xml = string.replace(xml, ns + ':', '')
        label = string.replace(label, ns + ':', '')
        xpath = string.replace(xpath, ns + ':', 'oaiabout:')
    xpath = string.replace(xpath, '//about', '//oaiabout:about')
    final_dict = {}
    parsed_doc = libxml2.parseDoc(xml)
    if string.find(xpath, 'oaiabout') == -1:
        #xpath_elements = split('/', xpath)
        xpath_elements = split(xpath, '/')
        xpath = ''
        del xpath_elements[0]
        del xpath_elements[0]
        nspace = '/lom:'
        for el in xpath_elements:
            xpath = xpath + nspace + el
        xpath = '/' + xpath
        xpath = string.replace(xpath, 'lom:@', '@')
        xpath = string.replace(xpath, 'about:@', '@')
    if debug == 1:
        print xpath
    ctx = parsed_doc.xpathNewContext()
    ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")
    ctx.xpathRegisterNs("oaiabout", "http://www.openarchives.org/OAI/2.0/")
    final_dict[label] = [node.content for node in ctx.xpathEval(xpath)]
    parsed_doc.freeDoc()
    if debug == 1:
        print final_dict
    return final_dict
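# Worked example of the rewrite in do_xpath_query: the query
# '//lom/general/title/string' is expanded to
# '//lom:lom/lom:general/lom:title/lom:string' before evaluation,
# with the lom prefix bound to http://ltsc.ieee.org/xsd/LOMv1p0.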
def classification_parser(xml, debug):
    # Special parser for the horribly nested classification section.
    # In order to get the proper nesting we need three passes:
    # 1) Grab the classification nodes
    # 2) For each classification node parse the taxonPaths
    # 3) For each taxonPath parse out the taxons
    parsed_doc = libxml2.parseDoc(xml)
    ctx = parsed_doc.xpathNewContext()
    ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")
    ctx.xpathRegisterNs("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    xpath = "//lom:lom/lom:classification"
    class_list = [node for node in ctx.xpathEval(xpath)]
    count = 0
    node_values = []
    for snode in class_list:
        if debug == 1: print snode.serialize()
        node_dict = {}
        new_doc = libxml2.parseDoc(snode.serialize())
        ctx = new_doc.xpathNewContext()
        xpath = "//classification/purpose/source"
        node_dict['classification_purpose_source'] = [node.content for node in ctx.xpathEval(xpath)]
        xpath = "//classification/purpose/value"
        node_dict['classification_purpose_value'] = [node.content for node in ctx.xpathEval(xpath)]
        xpath = "//classification/taxonPath"
        # Serialize the node back into XML for parsing once again :)
        xml_path_list = [node.serialize() for node in ctx.xpathEval(xpath)]
        node_dict['classification_taxonPaths'] = []
        for xml_path in xml_path_list:
            path = {}
            new_new_doc = libxml2.parseDoc(xml_path)
            ctx = new_new_doc.xpathNewContext()
            xpath = "//taxonPath/source/string"
            path['source_string'] = [node.content for node in ctx.xpathEval(xpath)]
            xpath = "//taxonPath/taxon/id"
            path['taxon_id'] = [node.content for node in ctx.xpathEval(xpath)]
            xpath = "//taxonPath/taxon/entry/string"
            path['taxon_entry_string'] = [node.content for node in ctx.xpathEval(xpath)]
            node_dict['classification_taxonPaths'].append(path)
        node_values.append(node_dict)
        count = count + 1
    return node_values
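# classification_parser returns a list of dicts shaped like:
# [{'classification_purpose_source': [...],
#   'classification_purpose_value': [...],
#   'classification_taxonPaths': [{'source_string': [...],
#                                  'taxon_id': [...],
#                                  'taxon_entry_string': [...]}]}]

# Main flow: pick the harvest window, load the repository definition,
# then walk the OAI-PMH identifiers and process each record.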
if harvest_type == 'daily':
    harvest_from = get_last_harvest_date(repository_handle)
else:
    harvest_from = '2003-01-01T00:00:00Z'
lom_handlers.clear_tables(debug)
print "\n##Going to harvest the %s repository##\n" % repository_handle
try:
    repository_definition = repository_definitions_map[repository_handle]
except KeyError:
    sys.exit("Sorry, there's no such repository definition as " + repository_handle + " in repository_definitions.ini:\n" + str(repository_definitions_map.keys()))
repository_domain = repository_definition['repository_domain']
uri_path = repository_definition['uri_path']
repository_uri = "http://%s%s" % (repository_domain,uri_path)
sesh = oaipmh.ServerProxy(repository_uri)
server_id = sesh.identify()
print server_id.repositoryName()
print server_id.adminEmails()
metadata_formats = sesh.listMetadataFormats()
if debug == 1:
    for format in metadata_formats:
        print format
all_identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_='2003-01-01T00:00:00Z')
all_ids_list = []
for header in all_identifiers:
    this_id = header.identifier()
    all_ids_list.append(this_id)
identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_=harvest_from)
xpath_map = get_xpath_map()
if debug == 1: print xpath_map
xpath_query_list = {}
for map in xpath_map:
    xpath_query_list[map['xpath_query']] = 1
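# Each xpath query doubles as a flattened record key, e.g.
# '//lom/general/description/string' becomes the label
# 'lom_general_description_string' via the replaces in the loop below.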
for header in identifiers:
    this_record = {}
    print "\n\n"
    this_id = header.identifier()
    identifier_list.append(this_id)
    print "\n %s" % this_id
    time.sleep(2)
    #sesh = httplib.HTTPConnection(repository_domain)
    #sesh.connect()
    #sesh.request('GET',uri_path + '?verb=GetRecord&metadataPrefix=lom&identifier=' + this_id)
    #response = sesh.getresponse()
    response = urllib.urlopen(repository_uri + '?verb=GetRecord&metadataPrefix=lom&identifier=' + this_id)
    xml = response.read()
    # Fix for Nik's dodgy classification section
    xml = string.replace(xml, 'taxonpath', 'taxonPath')
    neat_xml = lom_handlers.mysql_tidy(xml)
    reply = do_xpath_query(xml, '//lom/general/title/string', 'main_title', this_id)
    main_title = lom_handlers.mysql_tidy(reply['main_title'][0])
    sql = "insert into xml_store(oai_identifier,xml_document,record_main_title) values('%s','%s','%s')" % (this_id, neat_xml, main_title)
    if debug == 1: print sql
    response = do_sql.do_sql('ltsn01_harvester', sql, debug)
    do_sql.check_status(response, debug)
    for query in xpath_query_list.keys():
        label = query
        label = string.replace(label, '//', '')
        label = string.replace(label, 'oaiabout:', '')
        label = string.replace(label, '/', '_')
        label = string.replace(label, '@', '_')
        reply = do_xpath_query(xml, query, label, this_id)
        for ns in namespace_list:
            label = string.replace(label, ns + ':', '')
        this_record[label] = reply[label]
    if debug == 1:
        print "\nThis record:\n", this_record
        time.sleep(5)
    lom_resource_id = lom_handlers.add_main_title(this_id, debug, this_record)
    lom_handlers.add_title_added_entries(lom_resource_id, debug, this_record)
    lom_handlers.add_keywords(lom_resource_id, debug, this_record)
    lom_handlers.add_ISBNs(lom_resource_id, debug, this_record)
    lom_handlers.store_poi(lom_resource_id, debug, this_record)
    lom_handlers.lom_extra_languages(lom_resource_id, debug, this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_description_string', 'general_description', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_coverage_string', 'general_coverage', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_technical_otherPlatformRequirements_string', 'technical_otherPlatformRequirements', this_record)
    lom_handlers.add_authors(lom_resource_id, debug, this_record)
    lom_handlers.add_about(lom_resource_id, debug, this_record)
    lom_handlers.add_publishers(lom_resource_id, debug, this_record)
    lom_handlers.add_locations(lom_resource_id, debug, this_record)
    lom_handlers.add_formats(lom_resource_id, debug, this_record)
    lom_handlers.add_lr_types(lom_resource_id, debug, this_record)
    lom_handlers.add_educational_contexts(lom_resource_id, debug, this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_educational_description_string', 'educational_description', this_record)
    lom_handlers.add_general_value(lom_resource_id, debug, 'lom_rights_description_string', 'rights_description', this_record)
    if this_record['lom_rights_copyrightAndOtherRestrictions_value']:
        if this_record['lom_rights_copyrightAndOtherRestrictions_value'][0] in ('Yes', 'yes'):
            lom_handlers.add_set_value(lom_resource_id, debug, 1, 'rights_restricted')
    lom_handlers.add_relations(lom_resource_id, debug, this_record)
    class_list = classification_parser(xml, debug)
    this_record['lom_classification'] = class_list
    harvest[this_id] = this_record
    if debug == 1:
        print class_list
        time.sleep(3)
    lom_handlers.add_classification(lom_resource_id, debug, class_list)
# Put the entry in the harvests logging table
harvest_identifiers = string.join(identifier_list, '::')
all_ids = string.join(all_ids_list, '::')
if harvest_type != 'daily':
    harvest_type = 'complete'
sql = "insert into harvests(harvest_identifiers, all_identifiers, harvest_dictionary, harvest_type, repository_handle) values('" + harvest_identifiers + "','" + all_ids + "','" + lom_handlers.mysql_tidy(str(harvest)) + "','" + harvest_type + "','" + repository_handle + "')"
response = do_sql.do_sql('ltsn01_harvester',sql,debug)
do_sql.check_status(response, debug)
normalize_harvest.normalize_data(debug)
normalize_harvest.handle_deletions(repository_handle, debug)
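To run it, a complete harvest of one of the repositories defined in repository_definitions.ini looks something like this (the script name and handle here are just examples):

python lom_harvester.py some_repo complete

Passing daily as the second argument instead resumes from the date of the last logged daily harvest.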
Posted by pj at February 28, 2012 02:54 PM