Paul Hollands: Python MARC record cleaner

April 20, 2015

Python MARC record cleaner

# -*- coding: UTF-8 -*-

from pymarc import MARCReader
from pymarc import Record, Field
#from abc import ABCMeta, abstractmethod

import re, string, os, json
import sys, time
from datetime import date

"""
Class for printing out colours on the CLI
"""

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
	
"""
Class for when we don't want colours (monkeys with 'more' use and puts garbage into files)
"""
class nocolors:
    HEADER = ''
    OKBLUE = ''
    OKGREEN = ''
    WARNING = ''
    FAIL = ''
    ENDC = ''
    BOLD = ''
    UNDERLINE = ''

class ChangeLogger:
    """Delegate that collects change messages and returns them as JSON.

    Messages are kept in ``self.list`` as a nested mapping of
    ``{leader: {field: [message, ...]}}``.
    """

    def __init__(self):
        # Only create the store when the instance does not already expose one.
        if not hasattr(self, 'list'):
            self.set_list()

    def set_list(self):
        """Reset the log store to an empty mapping."""
        self.list = {}

    def log(self, leader, field, message):
        """Append ``message`` to the entries held for ``leader`` / ``field``.

        Missing leader or field buckets are created on demand.
        """
        self.list.setdefault(leader, {}).setdefault(field, []).append(message)

    def get_log(self, leader):
        """Return the entries logged for ``leader`` as a JSON object string.

        A leader with no entries yields ``"{}"`` rather than raising
        ``KeyError``; the lookup does not add the leader to the store.
        """
        return json.dumps(self.list.get(leader, {}))

class MARCParser(object):

    """
    Generic (maybe later abstract) class with the main MARC cleaning functions.

    The subfield helpers assume the flat pymarc subfield layout used throughout
    this file: ``[code, value, code, value, ...]``.

    The code is written so that it is valid under both Python 2 and Python 3
    (single-argument ``print(...)`` calls, ``str`` methods instead of the
    ``string`` module functions).
    """

    # Prefix that sends an 856 $u link through the proxy login page.
    EZPROXY_PREFIX = "http://ezproxy.lib.le.ac.uk/login?url="

    def __init__(self):

        """
        Instantiate the logger delegate, the colour writer and the edit rules.
        """

        self.logger = ChangeLogger()

        # No colours by default; parse_run() swaps this for bcolors() on request.
        self.bcolors = nocolors()

        # Tags whose fields are removed outright.
        self.deletions = ['520','533','655','776','505','500','588','590','980']

        # Simple "$a only" fields appended to every record.
        self.additions = [{'tag' : "909", 'value' : "Coutts MARC record for shelf-ready print. Cleaned PH %s" % date.today().isoformat()}]

        # Individual subfields stripped from the given tags.
        self.sub_deletions = [{'tag' : '245', 'sub' : 'h'}, {'tag' : '490', 'sub' : 'x'}]

        self.format = "print"

    def process_record(self, record):

        """
        Apply every cleaning step to one record, printing it before and after.

        Returns the same record object, modified in place.
        """

        # Important for printing using as_marc(), otherwise you get an encoding error.
        record.force_utf8 = 1

        # Work on a per-call copy: the original appended '856' to
        # self.deletions for every record, so the list kept growing and the
        # rule persisted even if the format was changed afterwards.
        deletions = list(self.deletions)
        if self.format == "print" and '856' not in deletions:
            deletions.append('856')

        # Print the leader for comparison *before* we amend it.
        print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)

        # Amend the leader for additions.
        record.leader = record.leader.replace('nam', 'nad').replace('cam', 'nad')

        self.log(record, 'LDR', "Replaced cam/nam with nad in the Leader")

        # Print out the record before we start to amend it, using the WARNING colour.
        for field in record.get_fields():
            if field is not None:
                print(self.bcolors.WARNING + "%s" % field + self.bcolors.ENDC)

        # Tidy the call number in 082 to remove any separator characters ' /
        record = self.clean_call_number(record)

        # Fix any edition entries and turn them into an ordinal (e.g. 2nd) if necessary.
        record = self.fix_editions(record)

        # Remove any non-LCSH subject headings based on the value of the second indicator.
        for i in ['1','2','3','4','5','6','7']:
            record = self.nuke_field_based_on_indicator(record, '650', 2, i)

        # Remove the copyright statement in 264 where the second indicator is a 4.
        record = self.nuke_field_based_on_indicator(record, '264', 2, '4')

        # Add any additional fields defined in __init__.
        for i in self.additions:
            record = self.add_simple_field(record, i['tag'], i['value'])

        # Delete the subfields defined in sub_deletions. See __init__.
        # NOTE(review): this runs before 440s are retagged as 490 below, so a
        # $x carried by a 440 survives the move - confirm whether that is intended.
        for i in self.sub_deletions:
            record = self.delete_subfield(record, i['tag'], i['sub'])

        # Delete the whole fields listed for deletion.
        for deli in deletions:
            record = self.delete_field_by_tag(record, deli)

        # If we have an eBook set, process the 856s.
        if self.format == 'ebooks':
            record = self.set_link_text(record)
            record = self.add_ezproxy(record)

        # Move any 440s into 490s.
        record = self.switch_tag(record, '440', '490')

        print(" ")

        # Print out the processed record without field colouring.
        print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)
        for field in record.get_fields():
            if field is not None:
                print("%s" % field)

        print(" ")

        # Print out the log. This is looked up by the current leader, so it is
        # read before as_marc() is called on the record below.
        print(self.bcolors.OKGREEN + self.logger.get_log(record.leader) + self.bcolors.ENDC)

        print(" ")

        # Print out the raw MARC. Where as_marc() yields bytes that are not
        # str (Python 3), decode them so they can be joined with the colour codes.
        raw = record.as_marc()
        if bytes is not str and isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        print(self.bcolors.OKBLUE + raw + self.bcolors.ENDC)

        return record

    def _subfield_pairs(self, subfields):

        """
        Return the flat subfield list as a list of (code, value) tuples.
        """

        return list(zip(subfields[::2], subfields[1::2]))

    def _first_subfield_value(self, record, tag, code):

        """
        Return the first ``code`` value in the first ``tag`` field, or None.
        """

        fields = record.get_fields(tag)
        if not fields:
            return None

        for sub_code, sub_value in self._subfield_pairs(fields[0].subfields):
            if sub_code == code:
                return sub_value

        return None

    def switch_tag(self, record, old, new):

        """
        Retag every ``old`` field as ``new``; logs only when something changed.
        """

        fields = record.get_fields(old)

        for field in fields:
            field.tag = new

        if fields:
            self.log(record, old, "Switched contents of %s to %s." % (old, new))

        return record

    def add_ezproxy(self, record):

        """
        Add the EZProxy prefix to the URL in every 856 $u.

        Each $u is rewritten in place with its own URL (the original copied the
        first 856's URL into every 856). Values that already carry the prefix
        are left alone.
        """

        changed = False

        for field in record.get_fields('856'):
            subs = field.subfields
            for pos in range(0, len(subs) - 1, 2):
                if subs[pos] != 'u':
                    continue
                if subs[pos + 1].startswith(self.EZPROXY_PREFIX):
                    continue
                subs[pos + 1] = self.EZPROXY_PREFIX + subs[pos + 1]
                changed = True

        if changed:
            self.log(record, '856', "Prefixed URL with EZPRoxy.")

        return record

    def set_link_text(self, record):

        """
        Add proper link text for the hyperlink into 856 $z.

        Any existing $z is removed first. The text quotes the 245 $a title,
        or falls back to a plain "Access the eBook" when no title is present.
        """

        record = self.delete_subfield(record, '856', 'z')

        fields = record.get_fields('856')
        if not fields:
            return record

        # Trim whitespace and trailing punctuation (slash, backslash, colon).
        title = (self._first_subfield_value(record, '245', 'a') or '').strip(' \r\n\t\\/:')

        if title:
            link_text = "Access the eBook \"%s\"" % title
        else:
            link_text = "Access the eBook"

        for field in fields:
            # Override the indicators.
            field.indicators = ['4','0']
            field.subfields.append('z')
            field.subfields.append(link_text)

        self.log(record, '856', "Set the link text in 856 $z: %s" % link_text)

        return record

    def add_simple_field(self, record, tagl, value):

        """
        Add a new field with just a $a subfield and blank indicators.
        """

        self.log(record, tagl, "Added new field %s with value of: %s" % (tagl, value))

        record.add_field(Field(
            tag = tagl,
            indicators = [' ', ' '],
            subfields = ['a', value],
        ))

        return record

    def delete_subfield(self, record, tagl, del_sub):

        """
        Remove every ``del_sub`` subfield from every ``tagl`` field.

        Each field gets its own rebuilt list, keeping the remaining subfields
        in their original order and keeping repeated codes. (The original
        shared one list between all fields of the tag and went through a dict,
        which merged fields, dropped repeats and re-sorted the codes.)
        """

        self.log(record, tagl, "Attempting to remove %s subfields from %s" % (del_sub, tagl))

        for field in record.get_fields(tagl):
            kept = []
            for code, value in self._subfield_pairs(field.subfields):
                if code != del_sub:
                    kept.extend((code, value))
            field.subfields = kept

        return record

    def subfields_to_dict(self, l):

        """
        Turn a flat subfield list into a {code: value} dict.

        Repeated codes collapse to the last value seen.
        """

        return dict(zip(l[::2], l[1::2]))

    def delete_field_by_tag(self, record, tag):

        """
        Pass the record and a tag and delete those fields from the record.
        """

        fields = record.get_fields(tag)

        if not fields:
            self.log(record, tag, "Tried deleting field(s) %s. None found." % tag)
            return record

        for field in fields:
            self.log(record, tag, "Deleted field(s) %s: %s" % (tag, field))
            record.remove_field(field)

        return record

    def clean_call_number(self, record):

        """
        Strip separators such as ' and / from the $a call number in 082 fields.

        Records without an 082, or with an 082 lacking $a, are returned untouched.
        """

        for field in record.get_fields('082'):
            subs = field.subfields
            for pos in range(0, len(subs) - 1, 2):
                if subs[pos] != 'a':
                    continue
                for strip in ["'", "/"]:
                    if strip in subs[pos + 1]:
                        self.log(record, '082', "Found and stripped %s in 082" % strip)
                        subs[pos + 1] = subs[pos + 1].replace(strip, "")

        return record

    def get_ordinal(self, value):

        """
        Return ``value`` as an English ordinal (1st, 2nd, 3rd, 4th, 11th ...).

        Anything that cannot be read as an integer (including None) is
        returned unchanged.
        """

        try:
            number = int(value)
        except (TypeError, ValueError):
            return value

        # 10-19 in the last two digits always take "th" (11th, 12th, 113th).
        if number % 100 // 10 == 1:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")

        return u"%d%s" % (number, suffix)

    def fix_editions(self, record):

        """
        Turn a bare number in 250 $a into an ordinal (e.g. "2" becomes "2nd").

        The first $a of each 250 is located by code rather than assumed to be
        the first subfield; a 250 without $a is left alone.
        """

        fields = record.get_fields('250')

        if not fields:
            self.log(record, '250', "Attempting to fix 250 but no 250 found.")
            return record

        for field in fields:
            subs = field.subfields
            for pos in range(0, len(subs) - 1, 2):
                if subs[pos] == 'a':
                    pipe_a = subs[pos + 1]
                    self.log(record, '250', "Attempting to fix 250 value to ordinal: %s" % pipe_a)
                    subs[pos + 1] = self.get_ordinal(pipe_a)
                    break

        return record

    def print_fields_by_tag(self, record, tag):

        """
        Print every field with the given tag in the WARNING colour.
        """

        for field in record.get_fields(tag):
            print(self.bcolors.WARNING + "%s" % field + self.bcolors.ENDC)

    def nuke_field_based_on_indicator(self, record, tag, indicator, value):

        """
        Given an indicator (1 for the first, 2 for the second), nuke each
        ``tag`` field that has ``value`` in that position.

        i.e. 650 with a second indicator of 7

        Matching fields are removed in place, so the remaining fields keep
        their position in the record. Fields whose indicators cannot be read
        are kept. Raises ValueError when ``indicator`` is not 1 or 2.
        """

        if indicator not in (1, 2):
            raise ValueError("indicator must be 1 or 2, got %r" % (indicator,))

        position = indicator - 1

        # get_fields() hands back a list, so removing while looping is safe.
        for field in record.get_fields(tag):

            try:
                current = field.indicators[position]
            except (AttributeError, IndexError, TypeError):
                # No usable indicator in that position: leave the field alone.
                continue

            if str(current) == str(value):
                message = "Removed field %s as %s was present in indicator %s - %s" % (tag, value, indicator, field)
                self.log(record, tag, message)
                record.remove_field(field)

        return record

    def log(self, record, field, message):

        """
        Function which hands off to the logger delegate, keyed on the record leader.
        """

        self.logger.log(record.leader, field, message)

    def parse_run(self):

        """
        Command-line driver: read the MARC file named in sys.argv[1], clean
        each record and write the results next to the input file.

        Options after the file name: -c/--colour/--color, -p/--print,
        -e/--ebooks. Exits with status 1 when no file name is given.
        """

        if len(sys.argv) < 2:
            print("Usage: python marc-parser.py <file.mrc> [-c for colour] [-p for print records] [-e for eBooks]")
            sys.exit(1)

        # Every option is checked against its own value (the original mixed up
        # sys.argv[2] and sys.argv[3] in the second set of tests).
        for option in sys.argv[2:]:
            if option in ("--colour", "--color", "-c"):
                self.bcolors = bcolors()
            elif option in ("--print", "-p"):
                self.format = "print"
            elif option in ("--ebooks", "-e"):
                self.format = "ebooks"

        infile = sys.argv[1]

        # Build the output name from the root of the input name. Replacing
        # '.mrc' textually left the name unchanged for other extensions, which
        # made the output path the input path and deleted the input.
        outfile = os.path.splitext(infile)[0] + ".final.%s.CLI.marc" % self.format

        counter = 0

        try:
            with open(infile, 'rb') as fh:
                reader = MARCReader(fh)

                # 'wb' truncates an existing file; the with-block closes it.
                with open(outfile, 'wb') as out:
                    for record in reader:
                        counter = counter + 1
                        print(" ")
                        print(self.bcolors.FAIL + "%s" % counter + self.bcolors.ENDC)
                        record = self.process_record(record)
                        out.write(record.as_marc())

            print("Wrote file:" + self.bcolors.FAIL + " %s" % outfile + self.bcolors.ENDC)

        except IOError as e:
            # Name the file that actually failed (input or output) when known.
            failed = getattr(e, 'filename', None) or infile
            print(self.bcolors.FAIL + "Unable to open file %s!" % failed + self.bcolors.ENDC)

if __name__ == "__main__":
    
    parser  = MARCParser()
    
    parser.parse_run()
Tags: Python
Posted by pj at April 20, 2015 08:52 PM
Paul Hollands

April 20, 2015

Python MARC record cleaner

Comments