
February 28, 2012

LOM OAI-PMH Harvester in Python
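
The script below harvests IEEE LOM metadata records from an OAI-PMH repository. It stores each raw XML record in MySQL, flattens selected fields using a configurable table of XPath queries, and logs every harvest so that daily runs can resume from the date of the previous one. It takes a repository handle and a harvest type on the command line.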

import oaipmh

import os

import re

import sys

import string

import time

import httplib

import urllib

import libxml2

from read_config import read_config

import do_sql

#from read_file import read_file

from MySQLdb.cursors import DictCursor

#from split import split
from string import split

import ConfigParser

import lom_handlers

import normalize_harvest

debug = 1

harvest = {}

repository_definitions_map = read_config('repository_definitions.ini', debug)

time.sleep(5)

# Namespaces to be stripped out of the about section and replaced with a single
# default one, i.e. oaiabout:

namespace_list = ['rdn_dc','dcterms','dc','meg','rdnterms']

identifier_list = []

harvest_type = ''
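
# The repository handle must match a section in repository_definitions.ini;
# the harvest type is 'daily' for an incremental harvest, anything else for
# a complete one.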

try:

	repository_handle = sys.argv[1]

	harvest_type = sys.argv[2]

except IndexError:

	sys.exit("\nUsage:\n\npython %s <repository_handle> <harvest_type>\n" % sys.argv[0])

def get_last_harvest_date(repository_handle):
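	# Return the datestamp of the most recent 'daily' harvest of this
	# repository, formatted as an OAI-PMH UTC datestamp for use as from=.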

	sql = "select max(date_format(harvest_date,'%Y-%m-%dT%H:%i:%sZ')) as last_harvest_date from harvests where harvest_type = 'daily' and repository_handle = '" + repository_handle + "'"

	response = do_sql.do_sql_query('ltsn01_harvester',sql,debug)

	do_sql.check_status(response, debug)

	results = response['results']

	return results[0]['last_harvest_date']

def get_xpath_map():
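	# Fetch the xpath_map table, which lists the XPath queries to run
	# against each harvested record.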

	sql = 'select * from xpath_map'

	response = do_sql.do_sql_query('ltsn01_harvester',sql,debug)
        
	do_sql.check_status(response, debug)

	results = response['results']

	return results

def do_xpath_query(xml, xpath, label, identifier):
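	# Evaluate a single XPath query against a LOM record and return a
	# {label: [matching node contents]} dictionary.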

	# We need to strip out namespaces from the about section
	# for our xpath to work, as we will be adding ones back in

	for ns in namespace_list:

		xml = string.replace(xml, ns + ':','')

		label = string.replace(label, ns + ':','')
		
		xpath = string.replace(xpath, ns + ':','oaiabout:')

	xpath = string.replace(xpath, '//about','//oaiabout:about')

	final_dict = {}

	parsed_doc = libxml2.parseDoc(xml)
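
	# Queries that don't touch the about section need the lom: namespace
	# prefix spliced into every path step, e.g. //lom/general/title/string
	# becomes //lom:lom/lom:general/lom:title/lom:string.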

	if string.find(xpath, 'oaiabout') == -1:

		#xpath_elements = split('/', xpath)
		xpath_elements = split(xpath,'/')

		xpath = ''

		del xpath_elements[0]

		del xpath_elements[0]

		nspace = '/lom:'

		for el in xpath_elements:

			xpath = xpath + nspace + el

		xpath = '/' + xpath

		xpath = string.replace(xpath, 'lom:@','@')

		xpath = string.replace(xpath, 'about:@','@')

	if debug == 1: 

		print xpath

	ctx = parsed_doc.xpathNewContext() 

	ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")

	ctx.xpathRegisterNs("oaiabout", "http://www.openarchives.org/OAI/2.0/")

	final_dict[label] = [node.content for node in ctx.xpathEval(xpath)] 

	parsed_doc.freeDoc()

	if debug == 1: 

		print final_dict

	return final_dict

def classification_parser(xml, debug):

	# Special parser for the horribly nested classification section.
	# In order to get the proper nesting we need three passes:
	# 1) Grab the classification nodes
	# 2) For each classification node parse the taxonPaths
	# 3) For each taxonPath parse out the taxons
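	#
	# The result is a list of dicts, one per classification node, each holding
	# the purpose source/value lists plus a classification_taxonPaths list of
	# {source_string, taxon_id, taxon_entry_string} dicts.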

	parsed_doc = libxml2.parseDoc(xml)

	ctx = parsed_doc.xpathNewContext()
                
	ctx.xpathRegisterNs("lom", "http://ltsc.ieee.org/xsd/LOMv1p0")

	ctx.xpathRegisterNs("xsi", "http://www.w3.org/2001/XMLSchema-instance")

	xpath="//lom:lom/lom:classification"

	class_list = [ node for node in ctx.xpathEval(xpath)]

	count = 0

	node_values = []

	for snode in class_list:

		if debug == 1: print snode.serialize()

		node_dict = {}

		new_doc = libxml2.parseDoc(snode.serialize())

		ctx = new_doc.xpathNewContext()

		xpath="//classification/purpose/source"

		node_dict['classification_purpose_source'] = [ node.content for node in ctx.xpathEval(xpath)]

		xpath="//classification/purpose/value"

		node_dict['classification_purpose_value'] = [ node.content for node in ctx.xpathEval(xpath)]

		xpath="//classification/taxonPath"

		# Serialize each taxonPath node back into XML so it can be parsed once again :)

		xml_path_list = [ node.serialize() for node in ctx.xpathEval(xpath)]

		node_dict['classification_taxonPaths'] = []

		for xml_path in xml_path_list:

			path = {}

			new_new_doc = libxml2.parseDoc(xml_path)

			ctx = new_new_doc.xpathNewContext()

			xpath="//taxonPath/source/string"

			path['source_string'] = [ node.content for node in ctx.xpathEval(xpath)]

			xpath="//taxonPath/taxon/id"

			path['taxon_id'] = [ node.content for node in ctx.xpathEval(xpath)]

			xpath="//taxonPath/taxon/entry/string"

			path['taxon_entry_string'] = [ node.content for node in ctx.xpathEval(xpath)]

			node_dict['classification_taxonPaths'].append(path)		

		node_values.append(node_dict)

		count = count + 1

	return node_values
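
# A daily harvest resumes from the date of the last recorded daily harvest;
# anything else re-harvests everything from a fixed early datestamp.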

if harvest_type == 'daily':

	harvest_from = get_last_harvest_date(repository_handle)

else:

	harvest_from = '2003-01-01T00:00:00Z'

lom_handlers.clear_tables(debug)

print "\n##Going to harvest the %s repository##\n" % repository_handle

try:

	repository_definition = repository_definitions_map[repository_handle]

except KeyError:

	sys.exit("Sorry, there's no such repository definition as " + repository_handle + " in repository_definitions.ini:\n" + str(repository_definitions_map.keys()))


repository_domain = repository_definition['repository_domain']

uri_path = repository_definition['uri_path']

repository_uri = "http://%s%s" % (repository_domain,uri_path)

sesh = oaipmh.ServerProxy(repository_uri)

server_id = sesh.identify()

print server_id.repositoryName()

print server_id.adminEmails()

metadata_formats = sesh.listMetadataFormats()

if debug == 1:

	for fmt in metadata_formats:

		print fmt
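
# List every identifier the repository currently exposes (used later for
# deletion handling), then list only the records changed since harvest_from.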

all_identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_='2003-01-01T00:00:00Z')

all_ids_list = []

for header in all_identifiers:

	this_id = header.identifier()

	all_ids_list.append(this_id)

identifiers = sesh.listIdentifiers(metadataPrefix='oai_dc', from_=harvest_from)

xpath_map = get_xpath_map()

if debug == 1:

	print xpath_map
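
# Collect the distinct XPath queries, using dict keys for de-duplication.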

xpath_query_list = {}

for row in xpath_map:

	xpath_query_list[row['xpath_query']] = 1

for header in identifiers:
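
	# For each changed record: fetch the full LOM XML with GetRecord, store
	# the raw document, evaluate every mapped XPath, then write the normalised
	# values out through the lom_handlers routines. The sleep between requests
	# keeps the load on the remote server down.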

	this_record = {}

	print "\n\n"

	this_id = header.identifier()

	identifier_list.append(this_id)

	print "\n %s" % this_id

	time.sleep(2)

	#sesh = httplib.HTTPConnection(repository_domain)

	#sesh.connect()

	#sesh.request('GET',uri_path + '?verb=GetRecord&metadataPrefix=lom&identifier=' + this_id)

	#response = sesh.getresponse()
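
	# Fetch the raw GetRecord response directly over HTTP; the oaipmh client
	# above is only used for Identify and ListIdentifiers.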

	response = urllib.urlopen(repository_uri + '?verb=GetRecord&metadataPrefix=lom&identifier=' + urllib.quote(this_id))

	xml = response.read()	

	# Fix for Nik's dodgy classification section

	xml = string.replace(xml, 'taxonpath', 'taxonPath')

	neat_xml = lom_handlers.mysql_tidy(xml)

	reply = do_xpath_query(xml, '//lom/general/title/string', 'main_title', this_id)

	main_title = lom_handlers.mysql_tidy(reply['main_title'][0])
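
	# mysql_tidy is expected to escape the values before they are interpolated
	# into the SQL below.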

	sql = "insert into xml_store(oai_identifier,xml_document,record_main_title) values('%s','%s','%s')" % (this_id,neat_xml,main_title)

	if debug == 1: print sql

	response = do_sql.do_sql('ltsn01_harvester',sql,debug)
        
	do_sql.check_status(response, debug)

	for query in xpath_query_list.keys():
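
		# Derive a flat label from the query, e.g. //lom/general/title/string
		# becomes lom_general_title_string.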

		label = query

		label = string.replace(label,'//','')

		label = string.replace(label,'oaiabout:','')

		label = string.replace(label,'/','_')

		label = string.replace(label,'@','_')

		reply = do_xpath_query(xml, query, label, this_id)

		for ns in namespace_list:

			label = string.replace(label, ns + ':','')

		this_record[label] = reply[label]

	if debug == 1: 

		print "\nThis record:\n", this_record

		time.sleep(5)

	lom_resource_id = lom_handlers.add_main_title(this_id, debug, this_record)

	lom_handlers.add_title_added_entries(lom_resource_id, debug, this_record)

	lom_handlers.add_keywords(lom_resource_id, debug, this_record)

	lom_handlers.add_ISBNs(lom_resource_id, debug, this_record)

	lom_handlers.store_poi(lom_resource_id, debug, this_record)

	lom_handlers.lom_extra_languages(lom_resource_id, debug, this_record)

	lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_description_string', 'general_description', this_record)

	lom_handlers.add_general_value(lom_resource_id, debug, 'lom_general_coverage_string', 'general_coverage', this_record)

	lom_handlers.add_general_value(lom_resource_id, debug, 'lom_technical_otherPlatformRequirements_string', 'technical_otherPlatformRequirements', this_record)

	lom_handlers.add_authors(lom_resource_id, debug, this_record)

	lom_handlers.add_about(lom_resource_id, debug, this_record)

	lom_handlers.add_publishers(lom_resource_id, debug, this_record)

	lom_handlers.add_locations(lom_resource_id, debug, this_record)

	lom_handlers.add_formats(lom_resource_id, debug, this_record)

	lom_handlers.add_lr_types(lom_resource_id, debug, this_record)

	lom_handlers.add_educational_contexts(lom_resource_id, debug, this_record)

	lom_handlers.add_general_value(lom_resource_id, debug, 'lom_educational_description_string', 'educational_description', this_record)

	lom_handlers.add_general_value(lom_resource_id, debug, 'lom_rights_description_string', 'rights_description', this_record)

	if this_record['lom_rights_copyrightAndOtherRestrictions_value']:

		if this_record['lom_rights_copyrightAndOtherRestrictions_value'][0] in ('Yes', 'yes'):

			lom_handlers.add_set_value(lom_resource_id, debug, 1, 'rights_restricted')

	lom_handlers.add_relations(lom_resource_id, debug, this_record)

	class_list = []

	class_list = classification_parser(xml, debug)

	this_record['lom_classification'] = class_list

	harvest[this_id] = this_record

	if debug == 1: 

		print class_list

		time.sleep(3)

	lom_handlers.add_classification(lom_resource_id, debug, class_list)

# Put the entry in the harvests logging table

harvest_identifiers = string.join(identifier_list, '::')

all_ids = string.join(all_ids_list, '::')

if harvest_type != 'daily':

	harvest_type = 'complete'

sql = "insert into harvests(harvest_identifiers, all_identifiers, harvest_dictionary, harvest_type, repository_handle) values('" + harvest_identifiers + "','" + all_ids + "','" + lom_handlers.mysql_tidy(str(harvest)) + "','" + harvest_type + "','" + repository_handle  + "')"

response = do_sql.do_sql('ltsn01_harvester',sql,debug)

do_sql.check_status(response, debug)
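
# Normalise the freshly harvested rows, then reconcile deletions against the
# repository's full identifier list.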

normalize_harvest.normalize_data(debug)

normalize_harvest.handle_deletions(repository_handle, debug)
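
The do_sql and lom_handlers modules used throughout are local helpers that aren't shown in this post. Judging from the calls above, do_sql.do_sql_query wraps MySQLdb with a DictCursor and returns a dictionary carrying a 'results' key (the 'status' field below is an assumption based on the check_status calls). A minimal sketch under those assumptions, with placeholder connection details:

import MySQLdb
from MySQLdb.cursors import DictCursor

def do_sql_query(database, sql, debug):

	# Run a query and return its rows as dictionaries keyed by column name.
	# Hypothetical sketch -- host, user and password are placeholders.
	response = {'status': 'ok', 'results': []}

	try:
		conn = MySQLdb.connect(host='localhost', user='harvester',
				passwd='secret', db=database, cursorclass=DictCursor)
		cursor = conn.cursor()
		cursor.execute(sql)
		response['results'] = cursor.fetchall()
		conn.close()
	except MySQLdb.Error, e:
		# Record the failure so a check_status-style helper can report it.
		response['status'] = str(e)

	if debug == 1: print response

	return response

do_sql.do_sql presumably does the same for statements that return no rows, and check_status inspects the result and complains when something went wrong.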


Posted by pj at February 28, 2012 02:54 PM
