« Some sample OS Ticket SQL | Main
April 20, 2015
Python MARC record cleaner
# -*- coding: UTF-8 -*-
from pymarc import MARCReader
from pymarc import Record, Field
#from abc import ABCMeta, abstractmethod
import re, string, os, json
import sys, time
from datetime import date
"""
Class for printing out colours on the CLI
"""
class bcolors:
    """
    ANSI escape sequences used to colour output on the CLI.

    Each attribute is a string to be concatenated in front of the text to
    colour; ENDC resets the terminal back to its default rendering.
    """
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
"""
Class for when we don't want colours (the escape codes interfere with 'more' and put garbage into files)
"""
class nocolors:
    """
    Colourless stand-in exposing the same attribute names as ``bcolors``.

    Every attribute is the empty string, so code that concatenates these
    values around its output emits plain text with no escape sequences.
    """
    HEADER = ''
    OKBLUE = ''
    OKGREEN = ''
    WARNING = ''
    FAIL = ''
    ENDC = ''
    BOLD = ''
    UNDERLINE = ''
class ChangeLogger:
    """
    Class for delegating logging functions to.
    Returns the log as JSON.

    Messages are kept in ``self.list``, a dict keyed by record leader. Each
    value is itself a dict mapping a field tag to the list of messages logged
    against that tag, in the order they were logged.

    NOTE: because the key is the leader string, two records with identical
    leaders share one log bucket.
    """

    def __init__(self):
        # Only create the store when the instance does not already expose one.
        if not hasattr(self, 'list'):
            self.set_list()

    # Set up the log list
    def set_list(self):
        """Reset the log store to an empty dict."""
        self.list = {}

    # Main function to add to the log
    def log(self, leader, field, message):
        """
        Append `message` to the entries recorded for `field` under `leader`,
        creating the leader and field buckets on first use.
        """
        self.list.setdefault(leader, {}).setdefault(field, []).append(message)

    # Function to return the log as JSON
    def get_log(self, leader):
        """
        Return the messages logged under `leader` as a JSON object string.

        A leader that has never been logged against yields "{}" rather than
        raising KeyError.
        """
        return json.dumps(self.list.get(leader, {}))
class MARCParser():
    """
    Generic (maybe later abstract) class with main MARC parser functions.

    Each record read from the input file is printed, cleaned according to the
    rule lists set up in __init__, printed again together with a JSON change
    log, and finally written to the output file.

    Subfields are handled as the flat list [code, value, code, value, ...]
    exposed by ``field.subfields``.
    """

    def __init__(self):
        """
        Instantiate the logger delegate object, our colour writer and the
        deletion / addition rule lists.
        """
        self.logger = ChangeLogger()
        self.bcolors = nocolors()
        # Tags whose fields are removed outright.
        self.deletions = ['520','533','655','776','505','500','588','590','980']
        # Simple $a-only fields appended to every record.
        self.additions = [{'tag' : "909", 'value' : "Coutts MARC record for shelf-ready print. Cleaned PH %s" % date.today().isoformat()}]
        # Individual subfields stripped from the given tags.
        self.sub_deletions = [{'tag' : '245', 'sub' : 'h'}, {'tag' : '490', 'sub' : 'x'}]
        self.format = "print"

    def process_record(self, record):
        """
        Function for processing each record. This should be moved to the concrete classes only (later).

        Prints the record before and after cleaning, prints the change log and
        the raw MARC, and returns the (mutated) record.
        """
        # Important for printing using as_marc(), otherwise you get an encoding error
        record.force_utf8 = 1
        # Print records never keep their 856s. Guarded so the tag is added to
        # the deletion list once, not once per record.
        if self.format == "print" and '856' not in self.deletions:
            self.deletions.append('856')
        # Print the leader for comparison *before* we amend it.
        print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)
        # Amend the leaders for additions
        record.leader = record.leader.replace('nam', 'nad')
        record.leader = record.leader.replace('cam', 'nad')
        self.log(record, 'LDR', "Replaced cam/nam with nad in the Leader")
        # Print out the record before we start to amend it using the yellow WARNING colour:
        for field in record.get_fields():
            if field is not None:
                print(self.bcolors.WARNING + "%s" % field + self.bcolors.ENDC)
        # Tidy the call number in 082 to remove any separator characters ' /
        record = self.clean_call_number(record)
        # Fix any editions entries and turn into an ordinal (e.g. 2nd) if necessary.
        record = self.fix_editions(record)
        # Remove any non-LCSH subject headings based on the value in the second indicator.
        for i in ['1','2','3','4','5','6','7']:
            record = self.nuke_field_based_on_indicator(record, '650', 2, i)
        # Remove copyright statement in 264 where the second indicator is a 4
        record = self.nuke_field_based_on_indicator(record, '264', 2, '4')
        # Add any additional fields as defined in our hash set in the __init__ function
        for i in self.additions:
            record = self.add_simple_field(record, i['tag'], i['value'])
        # Delete those sub-fields defined in our sub_deletions array. See __init__
        for i in self.sub_deletions:
            record = self.delete_subfield(record, i['tag'], i['sub'])
        # Delete those whole fields defined in our deletions array. See __init__
        for deli in self.deletions:
            record = self.delete_field_by_tag(record, deli)
        # If we have an eBook set, process the 856s
        if self.format == 'ebooks':
            record = self.set_link_text(record)
            record = self.add_ezproxy(record)
        # Move any 440s into 490s
        record = self.switch_tag(record, '440', '490')
        print(" ")
        # Print out the processed record in white
        print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)
        for field in record.get_fields():
            if field is not None:
                print("%s" % field)
        print(" ")
        # Print out the log
        print(self.bcolors.OKGREEN + self.logger.get_log(record.leader) + self.bcolors.ENDC)
        print(" ")
        # Print out the raw MARC. as_marc() may hand back a bytes object that
        # cannot be concatenated with the colour strings; decode it only then.
        raw = record.as_marc()
        if not isinstance(raw, str):
            raw = raw.decode('utf-8', 'replace')
        print(self.bcolors.OKBLUE + raw + self.bcolors.ENDC)
        return record

    def _map_subfield(self, field, code, transform):
        """
        Replace, in place, the value of every `code` subfield of `field` with
        transform(value). Returns how many values actually changed.
        """
        subs = field.subfields
        changed = 0
        # Codes sit at even positions, their values right after them.
        for pos in range(0, len(subs) - 1, 2):
            if subs[pos] != code:
                continue
            new_value = transform(subs[pos + 1])
            if new_value != subs[pos + 1]:
                subs[pos + 1] = new_value
                changed += 1
        return changed

    def switch_tag(self, record, old, new):
        """
        Retag every `old` field as `new`; logs only when something was retagged.
        """
        fields = record.get_fields(old)
        for field in fields:
            field.tag = new
        if fields:
            self.log(record, old, "Switched contents of %s to %s." % (old, new))
        return record

    def add_ezproxy(self, record):
        """
        Add the EZProxy prefix to the URL in 856 $u

        Every 856 field keeps its own URL(s); each $u value is prefixed where
        it stands. Fields without a $u are left untouched.
        """
        def prefix(url):
            return "http://ezproxy.lib.le.ac.uk/login?url=%s" % url

        prefixed = 0
        for field in record.get_fields('856'):
            prefixed += self._map_subfield(field, 'u', prefix)
        if prefixed:
            self.log(record, '856', "Prefixed URL with EZPRoxy.")
        return record

    def set_link_text(self, record):
        """
        Add proper link text for hyperlink into 856 $z

        The text is built from the first 245 $a, trimmed of surrounding
        whitespace, slashes, backslashes and colons. Any existing $z is
        replaced and the indicators are forced to 4 0.
        """
        fields = record.get_fields('856')
        if not fields:
            return record
        title = ''
        title_fields = record.get_fields('245')
        if title_fields:
            subs = title_fields[0].subfields
            for code, value in zip(subs[::2], subs[1::2]):
                if code == 'a':
                    title = (value or '').strip(' \r\n\t\\/:')
                    break
        record = self.delete_subfield(record, '856', 'z')
        for field in fields:
            # Override
            field.indicators = ['4','0']
            field.subfields.append('z')
            field.subfields.append("Access the eBook \"%s\"" % title)
        self.log(record, '856', "Set the link text in 856 $z: Access the eBook \"%s\"" % title)
        return record

    def add_simple_field(self, record, tagl, value):
        """
        Add a new field with just a $a subfield and no indicators
        """
        self.log(record, tagl, "Added new field %s with value of: %s" % (tagl, value))
        record.add_field(Field(
            tag = tagl,
            indicators = [' ', ' '],
            subfields = ['a', value],
        ))
        return record

    def delete_subfield(self, record, tagl, del_sub):
        """
        Remove every `del_sub` subfield from every `tagl` field.

        The remaining subfields keep their original order, and repeated codes
        are preserved. The list is rebuilt per field so nothing leaks from one
        field into the next.
        """
        self.log(record, tagl, "Attempting to remove %s subfields from %s" % (del_sub, tagl))
        for field in record.get_fields(tagl):
            subs = field.subfields
            kept = []
            for code, value in zip(subs[::2], subs[1::2]):
                if code != del_sub:
                    kept.append(code)
                    kept.append(value)
            field.subfields = kept
        return record

    def subfields_to_dict(self, l):
        """
        Turn a flat [code, value, ...] list into a {code: value} dict.
        A repeated code keeps only its last value.
        """
        return dict(zip(l[::2], l[1::2]))

    def delete_field_by_tag(self, record, tag):
        """
        Pass the record and a tag and delete that field from the record
        """
        fields = record.get_fields(tag)
        if not fields:
            self.log(record, tag, "Tried deleting field(s) %s. None found." % tag)
            return record
        for field in fields:
            self.log(record, tag, "Deleted field(s) %s: %s" % (tag, field))
            record.remove_field(field)
        return record

    def clean_call_number(self, record):
        """
        Strip separators such as ' and / from the call number in the 082 field

        Only the first 082 field is touched, and within it only $a values;
        a record without an 082 is returned unchanged.
        """
        fields = record.get_fields('082')
        if not fields:
            return record

        def strip_separators(value):
            for strip in ["'", "/"]:
                if strip in value:
                    self.log(record, '082', "Found and stripped %s in 082" % strip)
                    value = value.replace(strip, "")
            return value

        self._map_subfield(fields[0], 'a', strip_separators)
        return record

    def get_ordinal(self, value):
        """
        Return `value` as an English ordinal ("1st", "2nd", "11th", ...).
        Anything that cannot be read as an integer is returned unchanged.
        """
        try:
            number = int(value)
        except (ValueError, TypeError):
            return value
        # 10-19 in the last two digits always take "th" (11th, 112th, ...).
        if (number % 100) // 10 == 1:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
        return u"%d%s" % (number, suffix)

    def fix_editions(self, record):
        """
        Turn a bare number in the first 250's $a into an ordinal (e.g. 2 -> 2nd).
        """
        fields = record.get_fields('250')
        if not fields:
            self.log(record, '250', "Attempting to fix 250 but no 250 found.")
            return record

        def to_ordinal(value):
            self.log(record, '250', "Attempting to fix 250 value to ordinal: %s" % value)
            return self.get_ordinal(value)

        self._map_subfield(fields[0], 'a', to_ordinal)
        return record

    def print_fields_by_tag(self, record, tag):
        """
        Print every `tag` field of the record in the WARNING colour.
        """
        for fields in record.get_fields(tag):
            print(self.bcolors.WARNING + "%s" % fields + self.bcolors.ENDC)

    def nuke_field_based_on_indicator(self, record, tag, indicator, value):
        """
        Given an indicator (1st or 2nd), nuke the field if it has the value in that position.
        i.e. 650 with a second indicator of 7 (MESH heading)

        Fields that do not match stay where they are in the record. Raises
        ValueError when `indicator` is neither 1 nor 2.
        """
        if indicator not in (1, 2):
            raise ValueError("indicator must be 1 or 2, got %r" % (indicator,))
        for field in record.get_fields(tag):
            try:
                current = field.indicators[indicator - 1]
            except (AttributeError, IndexError, TypeError):
                # No usable indicator: keep the field, but say so in the log.
                message = "Kept field %s as indicator %s could not be read - %s" % (tag, indicator, field)
                self.log(record, tag, message)
                continue
            if str(current) == str(value):
                message = "Removed field %s as %s was present in indicator %s - %s" % (tag, value, indicator, field)
                self.log(record, tag, message)
                record.remove_field(field)
        return record

    def log(self, record, field, message):
        """
        Function which hands off to logger delegate object
        """
        self.logger.log(record.leader, field, message)

    def parse_run(self):
        """
        Command-line entry point.

        sys.argv[1] is the MARC file to read; any later argument may be
        -c/--colour/--color, -p/--print or -e/--ebooks. Cleaned records are
        written next to the input with a ".final.<format>.CLI.marc" suffix.
        Exits with status 1 when no input file is given.
        """
        if len(sys.argv) < 2:
            print("Usage: python marc-parser.py [-c for colour] [-p for print records] [-e for eBooks]")
            sys.exit(1)
        for flag in sys.argv[2:]:
            if flag in ("--colour", "--color", "-c"):
                self.bcolors = bcolors()
            elif flag in ("--print", "-p"):
                self.format = "print"
            elif flag in ("--ebooks", "-e"):
                self.format = "ebooks"
        source = sys.argv[1]
        suffix = ".final.%s.CLI.marc" % self.format
        # Without a ".mrc" to swap out, the replace would give back the input
        # path and we would overwrite the file being read; append instead.
        if '.mrc' in source:
            outfile = source.replace('.mrc', suffix)
        else:
            outfile = source + suffix
        counter = 0
        try:
            with open(source, 'rb') as fh:
                reader = MARCReader(fh)
                # 'wb' truncates any previous output; the with-block closes it.
                with open(outfile, 'wb') as out:
                    for record in reader:
                        counter = counter + 1
                        print(" ")
                        print(self.bcolors.FAIL + "%s" % counter + self.bcolors.ENDC)
                        record = self.process_record(record)
                        out.write(record.as_marc())
            print("Wrote file:" + self.bcolors.FAIL + " %s" % outfile + self.bcolors.ENDC)
        except IOError:
            print(self.bcolors.FAIL + "Unable to open file %s!" % sys.argv[1] + self.bcolors.ENDC)
# Run the cleaner only when executed as a script, not when imported.
if __name__ == "__main__":
    parser = MARCParser()
    parser.parse_run()
Tags: Python
Posted by pj at April 20, 2015 08:52 PM