April 20, 2015
Python MARC record cleaner
# -*- coding: UTF-8 -*-
"""Command-line cleaner for MARC records.

Reads a binary MARC file with pymarc, applies a fixed set of clean-up rules
to every record (see ``MARCParser.process_record``) and writes the cleaned
records to a new file whose name is derived from the input file name.

The code treats ``Field.subfields`` as a flat ``[code, value, code, value,
...]`` list and needs a pymarc release that exposes subfields in that form.
Every ``print`` call takes exactly one argument so that the module parses
under both Python 2 and Python 3.
"""
import json
import os
import re
import string
import sys
import time
from datetime import date

from pymarc import MARCReader
from pymarc import Record, Field


class bcolors(object):
    """ANSI escape sequences for printing colours on the CLI."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


class nocolors(object):
    """Same attributes as ``bcolors`` but all empty, for when colours are
    not wanted (the escape codes interfere with ``more`` and end up as
    garbage in redirected files)."""
    HEADER = ''
    OKBLUE = ''
    OKGREEN = ''
    WARNING = ''
    FAIL = ''
    ENDC = ''
    BOLD = ''
    UNDERLINE = ''


class ChangeLogger(object):
    """Delegate that collects log messages and returns them as JSON.

    Messages live in ``self.list`` as ``{leader: {field: [message, ...]}}``.
    NOTE: entries are keyed by the leader string, so two records with an
    identical leader share one entry.
    """

    def __init__(self):
        # Create the store unless something (e.g. a subclass attribute)
        # already provides one.
        if not hasattr(self, 'list'):
            self.set_list()

    def set_list(self):
        """Reset the log to an empty store."""
        self.list = {}

    def log(self, leader, field, message):
        """Append ``message`` to the entries for ``field`` under ``leader``."""
        self.list.setdefault(leader, {}).setdefault(field, []).append(message)

    def get_log(self, leader):
        """Return the entries logged under ``leader`` as a JSON string.

        A leader with no entries yields ``"{}"`` instead of raising KeyError.
        """
        return json.dumps(self.list.get(leader, {}))


class MARCParser(object):
    """Generic (maybe later abstract) class with the main MARC clean-up
    functions."""

    def __init__(self):
        """Set up the logger delegate, the colour writer and the rule lists."""
        self.logger = ChangeLogger()
        self.bcolors = nocolors()
        # Tags whose fields are deleted outright.
        self.deletions = ['520', '533', '655', '776', '505', '500', '588',
                          '590', '980']
        # Simple ($a only) fields added to every record.
        self.additions = [{
            'tag': "909",
            'value': "Coutts MARC record for shelf-ready print. Cleaned PH %s"
                     % date.today().isoformat(),
        }]
        # Individual subfields stripped from the given tags.
        self.sub_deletions = [{'tag': '245', 'sub': 'h'},
                              {'tag': '490', 'sub': 'x'}]
        self.format = "print"

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _subfield_pairs(field):
        """Return the field's flat subfield list as (code, value) tuples."""
        flat = field.subfields
        return list(zip(flat[::2], flat[1::2]))

    def _first_subfield(self, field, code):
        """Return the first value stored under ``code``, or None if absent."""
        for sub_code, sub_value in self._subfield_pairs(field):
            if sub_code == code:
                return sub_value
        return None

    def _emit(self, colour, text):
        """Print ``text`` wrapped in ``colour`` and the reset code."""
        print(colour + "%s" % text + self.bcolors.ENDC)

    # ------------------------------------------------------------------
    # Record processing
    # ------------------------------------------------------------------
    def process_record(self, record):
        """Apply every clean-up rule to ``record``, print a before/after
        report and return the record.

        This should be moved to the concrete classes only (later).
        """
        # Needed for as_marc(); without it an encoding error is raised.
        record.force_utf8 = 1

        # Print records drop their 856 links. Add the tag once only: this
        # method runs for every record and must not grow the list each time.
        if self.format == "print" and '856' not in self.deletions:
            self.deletions.append('856')

        # Print the leader for comparison *before* we amend it.
        self._emit(self.bcolors.HEADER, record.leader)

        # Amend the leader.
        record.leader = record.leader.replace('nam', 'nad')
        record.leader = record.leader.replace('cam', 'nad')
        self.log(record, 'LDR', "Replaced cam/nam with nad in the Leader")

        # Print the untouched fields in the WARNING colour.
        for field in record.get_fields():
            if field is not None:
                self._emit(self.bcolors.WARNING, field)

        # Tidy the call number in 082 (removes the separators ' and /).
        record = self.clean_call_number(record)

        # Turn a bare edition number into an ordinal (e.g. 2nd).
        record = self.fix_editions(record)

        # Remove non-LCSH subject headings, identified by the 2nd indicator.
        for ind_value in ['1', '2', '3', '4', '5', '6', '7']:
            record = self.nuke_field_based_on_indicator(
                record, '650', 2, ind_value)

        # Remove the copyright statement: 264 with a second indicator of 4.
        record = self.nuke_field_based_on_indicator(record, '264', 2, '4')

        # Add, then strip, according to the rule lists built in __init__.
        for addition in self.additions:
            record = self.add_simple_field(
                record, addition['tag'], addition['value'])
        for sub_deletion in self.sub_deletions:
            record = self.delete_subfield(
                record, sub_deletion['tag'], sub_deletion['sub'])
        for tag in self.deletions:
            record = self.delete_field_by_tag(record, tag)

        # eBook sets keep their 856s; give them link text and the proxy URL.
        if self.format == 'ebooks':
            record = self.set_link_text(record)
            record = self.add_ezproxy(record)

        # Move any 440s into 490s.
        record = self.switch_tag(record, '440', '490')

        print(" ")
        # Print the processed record (fields uncoloured).
        self._emit(self.bcolors.HEADER, record.leader)
        for field in record.get_fields():
            if field is not None:
                print("%s" % field)
        print(" ")

        # Print the log for this record.
        self._emit(self.bcolors.OKGREEN, self.logger.get_log(record.leader))
        print(" ")

        # Print the raw MARC. as_marc() may hand back bytes rather than str;
        # decode so the concatenation with the colour codes cannot fail.
        raw = record.as_marc()
        if not isinstance(raw, str):
            raw = raw.decode('utf-8', 'replace')
        print(self.bcolors.OKBLUE + raw + self.bcolors.ENDC)
        return record

    def switch_tag(self, record, old, new):
        """Retag every ``old`` field as ``new`` and return the record."""
        fields = record.get_fields(old)
        for field in fields:
            field.tag = new
        if fields:
            self.log(record, old,
                     "Switched contents of %s to %s." % (old, new))
        return record

    def add_ezproxy(self, record):
        """Add the EZProxy prefix to the URL(s) in 856 $u.

        Each 856 field keeps its own URL(s); fields without a $u are left
        alone. The prefixed $u subfields end up at the end of the field.
        """
        fields = record.get_fields('856')
        # Capture every field's own URLs before the $u subfields are removed.
        urls_per_field = [
            [value for code, value in self._subfield_pairs(field)
             if code == 'u']
            for field in fields
        ]
        if not any(urls_per_field):
            return record

        record = self.delete_subfield(record, '856', 'u')
        for field, urls in zip(fields, urls_per_field):
            if not urls:
                continue
            for url in urls:
                field.subfields.append('u')
                field.subfields.append(
                    "http://ezproxy.lib.le.ac.uk/login?url=%s" % url)
            self.log(record, '856', "Prefixed URL with EZPRoxy.")
        return record

    def set_link_text(self, record):
        """Add proper link text for the hyperlink into 856 $z.

        The text quotes the first 245 $a with surrounding whitespace,
        slashes, backslashes and colons stripped. When the record has no
        245 $a the text is the plain phrase without a title.
        """
        record = self.delete_subfield(record, '856', 'z')

        title_fields = record.get_fields('245')
        title = None
        if title_fields:
            title = self._first_subfield(title_fields[0], 'a')
        if title is None:
            link_text = "Access the eBook"
        else:
            link_text = "Access the eBook \"%s\"" % title.strip(' \r\n\t\\/:')

        for field in record.get_fields('856'):
            # Override whatever indicators the supplier sent.
            field.indicators = ['4', '0']
            field.subfields.append('z')
            field.subfields.append(link_text)
            self.log(record, '856',
                     "Set the link text in 856 $z: %s" % link_text)
        return record

    def add_simple_field(self, record, tagl, value):
        """Add a new field with just a $a subfield and blank indicators."""
        self.log(record, tagl,
                 "Added new field %s with value of: %s" % (tagl, value))
        record.add_field(Field(
            tag=tagl,
            indicators=[' ', ' '],
            subfields=['a', value],
        ))
        return record

    def delete_subfield(self, record, tagl, del_sub):
        """Remove every ``del_sub`` subfield from every ``tagl`` field.

        The remaining subfields keep their original order, and repeated
        codes are all retained. The list is rebuilt per field so that one
        field's subfields never leak into the next.
        """
        self.log(record, tagl,
                 "Attempting to remove %s subfields from %s"
                 % (del_sub, tagl))
        for field in record.get_fields(tagl):
            kept = []
            for code, value in self._subfield_pairs(field):
                if code != del_sub:
                    kept.append(code)
                    kept.append(value)
            field.subfields = kept
        return record

    def subfields_to_dict(self, l):
        """Turn a flat [code, value, ...] list into a {code: value} dict.

        A repeated code keeps only its last value.
        """
        return dict(zip(l[::2], l[1::2]))

    def delete_field_by_tag(self, record, tag):
        """Delete every field carrying ``tag`` from the record."""
        fields = record.get_fields(tag)
        if not fields:
            self.log(record, tag,
                     "Tried deleting field(s) %s. None found." % tag)
            return record
        for field in fields:
            self.log(record, tag, "Deleted field(s) %s: %s" % (tag, field))
            record.remove_field(field)
        return record

    def clean_call_number(self, record):
        """Strip the separators ' and / from the call number in 082.

        Only the first 082 field is touched, and within it the value of
        its first subfield.
        """
        fields = record.get_fields('082')
        if not fields:
            return record
        subs = getattr(fields[0], 'subfields', None)
        if not subs or len(subs) < 2:
            return record
        for separator in ("'", "/"):
            if separator in subs[1]:
                self.log(record, '082',
                         "Found and stripped %s in 082" % separator)
                subs[1] = subs[1].replace(separator, "")
        return record

    def get_ordinal(self, value):
        """Return ``value`` as an English ordinal ("1st", "2nd", "11th").

        Anything that ``int()`` cannot convert (including None) is returned
        unchanged.
        """
        try:
            number = int(value)
        except (TypeError, ValueError):
            return value
        if (number % 100) // 10 == 1:
            # 10-19 (and 110-119, ...) always take "th".
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
        return u"%d%s" % (number, suffix)

    def fix_editions(self, record):
        """Rewrite a purely numeric 250 $a as an ordinal (e.g. "2" -> "2nd").

        Works on the first 250 field and its first $a, wherever that
        subfield sits in the field.
        """
        fields = record.get_fields('250')
        if not fields:
            self.log(record, '250', "Attempting to fix 250 but no 250 found.")
            return record

        subs = fields[0].subfields
        # Codes sit at the even positions of the flat list.
        for position in range(0, len(subs) - 1, 2):
            if subs[position] == 'a':
                pipe_a = subs[position + 1]
                self.log(record, '250',
                         "Attempting to fix 250 value to ordinal: %s" % pipe_a)
                subs[position + 1] = self.get_ordinal(pipe_a)
                return record

        self.log(record, '250', "Attempting to fix 250 but no $a found.")
        return record

    def print_fields_by_tag(self, record, tag):
        """Print every ``tag`` field in the WARNING colour."""
        for field in record.get_fields(tag):
            self._emit(self.bcolors.WARNING, field)

    def nuke_field_based_on_indicator(self, record, tag, indicator, value):
        """Remove ``tag`` fields whose 1st or 2nd indicator equals ``value``.

        e.g. a 650 with a second indicator of 7. ``indicator`` is 1 or 2;
        anything else raises ValueError. Fields that do not match stay where
        they are in the record. A field whose indicator cannot be read is
        kept, and that is logged.
        """
        if indicator not in (1, 2):
            raise ValueError("indicator must be 1 or 2, got %r" % (indicator,))
        position = indicator - 1

        for field in record.get_fields(tag):
            try:
                current = field.indicators[position]
            except (AttributeError, IndexError, TypeError):
                self.log(record, tag,
                         "Kept field %s as indicator %s could not be read - %s"
                         % (tag, indicator, field))
                continue
            if str(current) == str(value):
                message = ("Removed field %s as %s was present in "
                           "indicator %s - %s"
                           % (tag, value, indicator, field))
                self.log(record, tag, message)
                record.remove_field(field)
        return record

    def log(self, record, field, message):
        """Hand the message off to the logger delegate, keyed by the leader."""
        self.logger.log(record.leader, field, message)

    def parse_run(self):
        """Parse ``sys.argv``, process the input file and write the output.

        ``sys.argv[1]`` is the input file; every later argument is checked
        against the colour/print/ebooks flags, and unknown ones are ignored.
        Exits with status 1 when no input file is given.
        """
        if len(sys.argv) < 2:
            print("Usage: python marc-parser.py <file.mrc> [-c for colour] "
                  "[-p for print records] [-e for eBooks]")
            sys.exit(1)

        for flag in sys.argv[2:]:
            if flag in ("--colour", "--color", "-c"):
                self.bcolors = bcolors()
            elif flag in ("--print", "-p"):
                self.format = "print"
            elif flag in ("--ebooks", "-e"):
                self.format = "ebooks"

        in_path = sys.argv[1]
        suffix = ".final.%s.CLI.marc" % self.format
        if '.mrc' in in_path:
            out_path = in_path.replace('.mrc', suffix)
        else:
            # Without this the output path would equal the input path and
            # the input file would be overwritten.
            out_path = in_path + suffix

        try:
            with open(in_path, 'rb') as fh:
                reader = MARCReader(fh)
                # 'wb' truncates an existing file; the with-block closes it.
                with open(out_path, 'wb') as out:
                    for counter, record in enumerate(reader, 1):
                        print(" ")
                        self._emit(self.bcolors.FAIL, counter)
                        record = self.process_record(record)
                        out.write(record.as_marc())
            print("Wrote file:" + self.bcolors.FAIL + " %s" % out_path
                  + self.bcolors.ENDC)
        except IOError:
            print(self.bcolors.FAIL + "Unable to open file %s!" % in_path
                  + self.bcolors.ENDC)


if __name__ == "__main__":
    parser = MARCParser()
    parser.parse_run()
Posted by pj at 08:52 PM | Comments (0)