Paul Hollands: Python Archives

Python

April 20, 2015

Python MARC record cleaner

# -*- coding: UTF-8 -*-

from pymarc import MARCReader
from pymarc import Record, Field
#from abc import ABCMeta, abstractmethod

import re, string, os, json
import sys, time
from datetime import date

"""
Class for printing out colours on the CLI
"""

class bcolors:
    """ANSI escape sequences used to colour and style text written to the terminal."""

    # Foreground colours.
    HEADER, OKBLUE, OKGREEN, WARNING, FAIL = (
        '\033[95m',
        '\033[94m',
        '\033[92m',
        '\033[93m',
        '\033[91m',
    )

    # Reset and text attributes.
    ENDC, BOLD, UNDERLINE = '\033[0m', '\033[1m', '\033[4m'
	
"""
Class for when we don't want colours (the escape codes interfere with 'more' and put garbage into files)
"""
class nocolors:
    """Drop-in replacement for bcolors in which every code is the empty string."""

    HEADER = OKBLUE = OKGREEN = WARNING = FAIL = ''
    ENDC = BOLD = UNDERLINE = ''

class ChangeLogger:

	"""
	Class for delegating logging functions to.

	Messages are kept in self.list, a dictionary keyed by record leader whose
	values are dictionaries mapping a field tag to the list of messages
	logged against that tag.

	Returns the log as JSON.
	"""

	def __init__(self):
		# Only create the log if the instance (or a subclass) has not
		# already provided one.
		if not hasattr(self, 'list'):
			self.set_list()

	# Set up the log list
	def set_list(self):
		"""Create (or reset) the log as an empty dictionary."""
		self.list = {}

	# Main function to add to the log
	def log(self, leader, field, message):
		"""
		Append message to the entries recorded for field under leader.

		The per-leader and per-field containers are created on first use.
		"""
		self.list.setdefault(leader, {}).setdefault(field, []).append(message)

	# Function to return the log as JSON
	def get_log(self, leader):
		"""
		Return everything logged under leader as a JSON object string.

		A leader that has nothing logged against it yields "{}".
		"""
		return json.dumps(self.list.get(leader, {}))

class MARCParser():
	
	"""
	Generic (maybe later abstract) class with main MARC parser functions.
	"""
	
	def __init__(self):

		"""
		Set up the defaults: print format, no colours, a fresh logger and the
		lists describing which fields and subfields to delete or add.
		"""

		# Records are treated as print records unless parse_run() is told otherwise.
		self.format = "print"

		# Colour output is off by default.
		self.bcolors = nocolors()

		# Delegate that collects the change messages.
		self.logger = ChangeLogger()

		# Tags of the fields that are deleted outright.
		self.deletions = ['520','533','655','776','505','500','588','590','980']

		# Subfields that are stripped from the given tags.
		self.sub_deletions = [
			{'tag' : tag, 'sub' : sub}
			for tag, sub in (('245', 'h'), ('490', 'x'))
		]

		# Fields added to every record; the note is stamped with today's date.
		stamp = date.today().isoformat()
		note = "Coutts MARC record for shelf-ready print. Cleaned PH %s" % stamp
		self.additions = [{'tag' : "909", 'value' : note}]
			
	def process_record(self, record):

		"""
		Function for processing each record. This should be moved to the concrete classes only (later).

		Prints the record, applies the cleaning steps in order (leader,
		082 call number, 250 edition, indicator-based removals, additions,
		subfield deletions, field deletions, 856 handling for eBooks,
		440 -> 490), then prints the cleaned record, its change log and the
		raw MARC.

		record -- the record to clean; it is modified in place and returned.
		"""

		# Important for printing using as_marc(), otherwise you get an encoding error.

		record.force_utf8 = 1

		# Print records lose their 856 fields. This method runs once per record,
		# so only add the tag if it is not already in the deletions list.

		if self.format == "print" and '856' not in self.deletions:
			self.deletions.append('856')

		# Print the leader for comparison *before* we amend it.

		print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)

		# Amend the leader for additions. The log is keyed on the leader, so
		# logging only starts once the leader has its final value.

		record.leader = record.leader.replace('nam', 'nad').replace('cam', 'nad')

		self.log(record, 'LDR', "Replaced cam/nam with nad in the Leader")

		# Print out the record before we start to amend it using the yellow WARNING colour:

		for field in record.get_fields():
			if field is not None:
				print(self.bcolors.WARNING + "%s" % field + self.bcolors.ENDC)

		# Tidy the call number in 082 to remove any separator characters ' /

		record = self.clean_call_number(record)

		# Fix any editions entries and turn into an ordinal (e.g. 2nd) if necessary.

		record = self.fix_editions(record)

		# Remove any non-LCSH subject headings based on the value in the second indicator.

		for indicator_value in ['1','2','3','4','5','6','7']:
			record = self.nuke_field_based_on_indicator(record, '650', 2, indicator_value)

		# Remove copyright statement in 264 where the second indicator is a 4

		record = self.nuke_field_based_on_indicator(record, '264', 2, '4')

		# Add any additional fields as defined in our hash set in the __init__ function

		for addition in self.additions:
			record = self.add_simple_field(record, addition['tag'], addition['value'])

		# Delete those sub-fields defined in our sub_deletions array. See __init__

		for sub_deletion in self.sub_deletions:
			record = self.delete_subfield(record, sub_deletion['tag'], sub_deletion['sub'])

		# Delete those whole fields defined in our deletions array. See __init__

		for deli in self.deletions:
			record = self.delete_field_by_tag(record, deli)

		# If we have an eBook set, process the 856s

		if self.format == 'ebooks':
			record = self.set_link_text(record)
			record = self.add_ezproxy(record)

		# Move any 440s into 490s

		record = self.switch_tag(record, '440', '490')

		print(" ")

		# Print out the processed record in white

		print(self.bcolors.HEADER + "%s" % record.leader + self.bcolors.ENDC)
		for field in record.get_fields():
			if field is not None:
				print("%s" % field)

		print(" ")

		# Print out the log

		print(self.bcolors.OKGREEN + self.logger.get_log(record.leader) + self.bcolors.ENDC)

		print(" ")

		# Print out the raw MARC

		print(self.bcolors.OKBLUE + record.as_marc() + self.bcolors.ENDC)

		return record
	
	def switch_tag(self, record, old, new):

		"""
		Re-tag every field whose tag is old so that it carries the tag new.

		The fields are changed in place; a log entry is written against the
		old tag only when at least one field was switched. Returns the record.
		"""

		try:
			fields = record.get_fields(old)

		except Exception:
			# Best effort: a record we cannot query is returned untouched.
			return record

		for field in fields:
			field.tag = new

		if fields:
			self.log(record, old, "Switched contents of %s to %s." % (old, new))

		return record
	
	def add_ezproxy(self, record):

		"""
		Add the EZProxy prefix to the URL in 856 $u

		Every $u of every 856 field is prefixed with its own URL, in place,
		so subfield order is kept. Fields without a $u are left alone and a
		URL that already carries the prefix is not prefixed again. A log
		entry is written only when something changed. Returns the record.
		"""

		prefix = "http://ezproxy.lib.le.ac.uk/login?url="

		changed = False

		for field in record.get_fields('856'):
			# subfields is treated as a flat [code, value, code, value, ...]
			# list, as elsewhere in this class.
			subs = field.subfields

			for pos in range(0, len(subs) - 1, 2):
				if subs[pos] == 'u' and not subs[pos + 1].startswith(prefix):
					subs[pos + 1] = prefix + subs[pos + 1]
					changed = True

		if changed:
			self.log(record, '856', "Prefixed URL with EZPRoxy.")

		return record
	
	def set_link_text(self, record):

		"""
		Add proper link text for hyperlink into 856 $z

		Any existing 856 $z is removed first. Each 856 then gets indicators
		4 and 0 and a new $z built from the title in the first 245 $a found.
		When the record has no 245 $a the link text falls back to a generic
		"Access the eBook". Records without an 856 are returned unchanged.
		"""

		record = self.delete_subfield(record, '856', 'z')

		fields = record.get_fields('856')

		if not fields:
			return record

		# Take the title from the first 245 that actually has a $a.
		title = ''
		for title_field in record.get_fields('245'):
			values = title_field.get_subfields('a')
			if values:
				# Trim whitespace, slashes, backslashes and colons from both ends.
				title = values[0].strip(' \r\n\t\\/:')
				break

		if title:
			link_text = "Access the eBook \"%s\"" % title
		else:
			link_text = "Access the eBook"

		for field in fields:
			# Override
			field.indicators = ['4','0']
			field.subfields.append('z')
			field.subfields.append(link_text)

		self.log(record, '856', "Set the link text in 856 $z: %s" % link_text)

		return record
	
	def add_simple_field(self, record, tagl, value):

		"""
		Append a field with tag tagl to the record. The field has blank
		indicators and a single $a subfield holding value.

		The addition is logged against tagl and the record is returned.
		"""

		message = "Added new field %s with value of: %s" % (tagl, value)
		self.log(record, tagl, message)

		blank_indicators = [' ', ' ']
		new_field = Field(tag = tagl, indicators = blank_indicators, subfields = ['a', value])

		record.add_field(new_field)

		return record

	def delete_subfield(self, record, tagl, del_sub):

		"""
		Remove every del_sub subfield from all fields with the tag tagl.

		The remaining subfields keep their original order and repeated
		subfield codes are preserved. Each field is filtered on its own, so
		subfields never leak from one field into the next. The attempt is
		logged whether or not anything was removed. Returns the record.
		"""

		self.log(record, tagl, "Attempting to remove %s subfields from %s" % (del_sub, tagl))

		for field in record.get_fields(tagl):
			# subfields is treated as a flat [code, value, code, value, ...] list.
			subs = field.subfields

			kept = []

			for code, value in zip(subs[::2], subs[1::2]):
				if code != del_sub:
					kept.append(code)
					kept.append(value)

			field.subfields = kept

		return record
	
	def subfields_to_dict(self, l):

		"""
		Turn a flat [code, value, code, value, ...] list into a dictionary.

		A code that occurs more than once keeps its last value, and a
		trailing code with no value is dropped.
		"""

		codes = l[0::2]
		values = l[1::2]

		return dict(zip(codes, values))
	
	def delete_field_by_tag(self, record, tag):

		"""
		Pass the record and a tag and delete every field with that tag from the record.

		Each deletion is logged against the tag; when the record has no such
		field that is logged instead. Returns the record.
		"""

		try:
			fields = record.get_fields(tag)

		except Exception:
			# Best effort: treat a record we cannot query as having no such field.
			fields = []

		if not fields:
			self.log(record, tag, "Tried deleting field(s) %s. None found." % tag)
			return record

		for field in fields:
			self.log(record, tag, "Deleted field(s) %s: %s" % (tag, field))
			record.remove_field(field)

		return record

	def clean_call_number(self, record):

		"""
		Strip separators such as ' and / from the call number in the 082 field

		Only the first 082 field is touched, and within it only the value of
		its first subfield. Records with no 082, or with an 082 that has no
		subfields, are returned unchanged. Each separator that was found is
		logged against 082. Returns the record.
		"""

		fields = record.get_fields('082')

		if not fields:
			return record

		# subfields is treated as a flat [code, value, ...] list, so index 1
		# is the value of the first subfield.
		subs = fields[0].subfields

		if not subs or len(subs) < 2:
			return record

		for strip in ["'","/"]:
			if strip in subs[1]:
				self.log(record, '082', "Found and stripped %s in 082" % strip)
				subs[1] = subs[1].replace(strip, "")

		return record

	def get_ordinal(self, value):

		"""
		Turn a number into its English ordinal, e.g. 2 or "2" becomes "2nd".

		Anything that int() cannot convert (text such as "2nd ed.", or None)
		is returned unchanged.
		"""

		try:
			number = int(value)
		except (TypeError, ValueError):
			return value

		if (number % 100) // 10 == 1:
			# 10-19 (and 110-119, ...) always take "th": 11th, 12th, 13th.
			suffix = "th"
		else:
			suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")

		return u"%d%s" % (number, suffix)
	
	def fix_editions(self, record):

		"""
		Turn a purely numeric edition statement in 250 $a into an ordinal (e.g. 2 -> 2nd).

		Only the first 250 field and its first $a are considered. The $a is
		located by its code, so a 250 that starts with another subfield is
		handled correctly and a 250 without a $a is left alone. Values that
		are not plain numbers come back from get_ordinal() unchanged.
		Returns the record.
		"""

		fields = record.get_fields('250')

		if not fields:
			self.log(record, '250', "Attempting to fix 250 but no 250 found.")
			return record

		# subfields is treated as a flat [code, value, code, value, ...] list.
		subs = fields[0].subfields

		for pos in range(0, len(subs) - 1, 2):
			if subs[pos] == 'a':
				pipe_a = subs[pos + 1]
				self.log(record, '250', "Attempting to fix 250 value to ordinal: %s" % pipe_a)
				subs[pos + 1] = self.get_ordinal(pipe_a)
				break

		return record
	
	def print_fields_by_tag(self, record, tag):

		"""
		Print each field of the record that has the given tag, in the WARNING colour.
		"""

		start = self.bcolors.WARNING
		end = self.bcolors.ENDC

		for field in record.get_fields(tag):
			print(start + "%s" % field + end)
				
	def nuke_field_based_on_indicator(self, record, tag, indicator, value):

		"""
		Given an indicator (1st or 2nd), nuke the field if it has the value in that position.

		i.e. 650 with a second indicator of 7 (MESH heading)

		record    -- the record to modify; it is changed in place and returned
		tag       -- tag of the fields to inspect
		indicator -- 1 or 2, choosing which indicator to test; any other
		             value raises ValueError
		value     -- indicator value that marks a field for removal
		             (compared as strings)

		Only matching fields are removed; all other fields stay where they
		are in the record. A field that has no such indicator is kept and
		that is logged.
		"""

		if indicator not in (1, 2):
			raise ValueError("indicator must be 1 or 2, got %r" % (indicator,))

		position = indicator - 1

		for field in record.get_fields(tag):

			try:
				val = field.indicators[position]
			except (IndexError, TypeError):
				# The field does not have this indicator, so it cannot match.
				message = "Kept field %s as it has no indicator %s - %s" % (tag, indicator, field)
				self.log(record, tag, message)
				continue

			if str(val) == str(value):
				message = "Removed field %s as %s was present in indicator %s - %s" % (tag, value, indicator, field)
				self.log(record, tag, message)
				record.remove_field(field)

		return record
	
	def log(self, record, field, message):

		"""
		Record message against field for this record by handing it to the
		logger delegate, which files it under the record's current leader.
		"""

		leader = record.leader
		self.logger.log(leader, field, message)
	
	def parse_run(self):

		"""
		Command-line driver: clean every record in the file named by
		sys.argv[1] and write the results to a new file next to it.

		Flags after the file name: -c/--colour/--color switches colours on,
		-p/--print selects print records (the default) and -e/--ebooks
		selects eBooks. With no file name the usage text is printed and the
		script exits with status 1.

		The output name is the input name with '.mrc' replaced by
		'.final.<format>.CLI.marc'. If the input name contains no '.mrc' the
		suffix is appended instead, so the input file is never overwritten
		or deleted.
		"""

		if len(sys.argv) < 2:
			print("Usage: python marc-parser.py <file.mrc> [-c for colour] [-p for print records] [-e for eBooks]")
			sys.exit(1)

		for flag in sys.argv[2:]:

			if flag in ("--colour", "--color", "-c"):
				self.bcolors = bcolors()

			elif flag in ("--print", "-p"):
				self.format = "print"

			elif flag in ("--ebooks", "-e"):
				self.format = "ebooks"

		infile = sys.argv[1]
		suffix = ".final.%s.CLI.marc" % self.format
		outfile = infile.replace('.mrc', suffix)

		if outfile == infile:
			# Nothing was replaced: do not let the output clobber the input.
			outfile = infile + suffix

		counter = 0

		try:
			with open(infile, 'rb') as fh:
				reader = MARCReader(fh)

				if os.path.isfile(outfile):
					os.remove(outfile)

				# The with block closes (and so flushes) the output file.
				with open(outfile, 'wb') as out:

					for record in reader:
						counter = counter + 1
						print(" ")
						print(self.bcolors.FAIL + "%s" % counter)
						record = self.process_record(record)
						out.write(record.as_marc())

				print("Wrote file:" + self.bcolors.FAIL + " %s" % outfile + self.bcolors.ENDC)

		except IOError:
			print(self.bcolors.FAIL + "Unable to open file %s!" % sys.argv[1] + self.bcolors.ENDC)

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    
    # Build a parser with the default settings ...
    parser  = MARCParser()
    
    # ... and let it read the file name and flags from the command line.
    parser.parse_run()

Posted by pj at 08:52 PM | Comments (0)

September 17, 2011

I love Python

import glob, os, re, string

def rename(dir, pattern):
    """
    Rename the files in dir that match the glob pattern to tidy names.

    Each run of non-word characters in the file title is replaced by a
    single hyphen and the title is lower-cased; the extension is kept as
    it is. The new title of every matching file is printed.

    A file whose name is already tidy is left alone, and a file is
    skipped (with a message) rather than renamed over a different
    existing file.
    """
    for path_filename in glob.glob(os.path.join(dir, pattern)):

        title, ext = os.path.splitext(os.path.basename(path_filename))

        new_title = re.sub(r'[\W]+', '-', title).lower()

        print(new_title)

        target = os.path.join(dir, new_title + ext)

        if target == path_filename:
            continue

        # samefile() lets case-only renames through on case-insensitive filesystems.
        if os.path.exists(target) and not os.path.samefile(path_filename, target):
            print("Skipping %s: %s already exists" % (path_filename, target))
            continue

        os.rename(path_filename, target)

# Tidy the names of all the PDFs in the papers directory.
rename(r'/home/phollan2/crc_papers_2011',r'*.pdf')

Posted by pj at 01:28 PM | Comments (0)

September 03, 2011

Cosign with Django

Django | Authentication using REMOTE_USER | Django documentation

Posted by pj at 06:53 PM | Comments (0)

October 09, 2009

PostgreSQL version of do_sql.py

import sys

import postgresql

from read_config import read_config

import time

import string

import re

def parse_insert_sql(sql):
	"""
	Work out whether sql is an INSERT and, if so, make it return the new row's id.

	Returns a dictionary with two keys:

	is_insert -- 1 when the first word of the statement is "insert", else 0
	sql       -- the statement to run. For an INSERT that does not already
	             contain "returning <table>_id", the clause
	             " returning <table>_id as insert_id" is appended, where
	             <table> is the third word of the statement. Everything
	             else is passed through unchanged.
	"""

	response = {'is_insert': 0, 'sql': sql}

	# Split on whitespace and parentheses so "stuff(stuff_id" yields "stuff".
	da_list = re.split(r"[\s()]+", sql.lower().strip())

	if da_list[0] != 'insert':

		return response

	response['is_insert'] = 1

	if len(da_list) < 3:

		# Too short to name a table; leave the statement alone.
		return response

	ma_table = da_list[2]

	id_column = "%s_id" % (ma_table)

	# Look for the two adjacent words "returning" and "<table>_id".
	already_returning = any(
		first == 'returning' and second == id_column
		for first, second in zip(da_list, da_list[1:])
	)

	if not already_returning:

		response['sql'] = sql + " returning %s as insert_id" % (id_column)

	return response

def do_sql_query(db_config_file, sql, debug):
	"""
	Takes config file name, your SQL and a debug parameter.

	The connection settings are read from '<db_config_file>.ini'. INSERT
	statements are first passed through parse_insert_sql() so that they
	return the id of the new row.

	Returns a dictionary:

	status    -- 1 on success, 0 on failure
	results   -- the prepared statement on success; on failure whatever had
	             been obtained before the error (an empty tuple if nothing)
	insert_id -- for INSERTs only, the first value returned by the statement
	error     -- on failure only, a message holding the SQL and the error text

	When debug is 1 the error message is also printed.
	"""

	response = {}

	results = ()

	connection_map = read_config('%s.ini' % db_config_file, debug)

	parameters = connection_map['connection_parameters']

	(host, user, passwd, db) = (parameters['host'],
	parameters['user'],
	parameters['password'],
	parameters['db'])

	# NOTE(review): the connection is never closed here because the prepared
	# statement handed back in response['results'] is created from it.
	connection = postgresql.open("pq://%s:%s@%s/%s" % (user, passwd, host, db))

	insert_response = parse_insert_sql(sql)

	if insert_response['is_insert'] == 1:

		sql = insert_response['sql']

	try:

		results = connection.prepare(sql)

		response['results'] = results

		response['status'] = 1

		if insert_response['is_insert'] == 1:

			response['insert_id'] = results.first()

		return(response)

	except Exception as err:

		# Keep the underlying error text; the SQL alone does not say what went wrong.
		message = "\nPostgreSQL error: %s\n%s\n" % (sql, err)

		if debug == 1:

			print (message)

		response['status'] = 0

		response['error'] = message

		response['results'] = results

		return(response)
	
if __name__ == "__main__":

	# Example run: insert a row and show the id that came back ...
	insert_sql = "insert into stuff(stuff_id, nonsense) values(nextval('stuff_seq'::regclass), 'Glug')"

	response = do_sql_query('waf_common', insert_sql, 0)

	print (response['insert_id'])

	# ... then select some rows and print them column by column.
	response = do_sql_query('waf_common', "select * from staff limit 10", 0)

	rows = response['results']

	print (rows.first())

	for record in rows:

		for column in record.column_names:

			print("%s : %s" % (column, record[column]))

		print("\n ----------- \n")

Posted by pj at 05:00 PM | Comments (0)

March 29, 2009

Getting MySQL-python-1.2.2 to work with XAMPP

I'm trying to get the Python MySQLdb library to talk to my XAMPP MySQL. Here's how:

1. Before you build the db adaptor, change the site.cfg file to point to XAMPP's mysql_config

# The path to mysql_config.
# Only use this if mysql_config is not on your PATH, or you have some weird
# setup that requires it.
mysql_config = /Applications/xampp/xamppfiles/bin/mysql_config

2. Link the dylib

cp /Applications/xampp/xamppfiles/lib/mysql/libmysqlclient.15.dylib /usr/local/mysql/lib/mysql/libmysqlclient_r.15.dylib 

mkdir /Applications/xampp/xamppfiles/include
ln -s /usr/local/mysql-5.1.32-osx10.5-powerpc/include /Applications/xampp/xamppfiles/include/mysql

3. You have to point your script at localhost using the machine's actual IP address, as if it were a remote server.

Posted by pj at 11:59 AM | Comments (0)

CherryPy HTMLTemplate

HTMLTemplate - CherryPy Tools - Trac

Posted by pj at 02:15 AM | Comments (0)

March 26, 2009

Python script for producing svn diffs


import os

import subprocess

# Revision that every file is diffed against.
REVISION = "455"

# changed_files.txt lists one path per line.
with open('changed_files.txt', 'r') as my_file:
    lines = my_file.readlines()

for line in lines:

    path = line.strip()

    if not path:
        # Skip blank lines rather than running svn with no path.
        continue

    # Flatten the path into one file name: a/b/c.py -> a_b_c.py.diff
    diff_name = "_".join(path.split('/')) + ".diff"

    # Pass the path as a single argument (no shell) and wait for svn to finish.
    with open(diff_name, 'w') as diff_file:
        subprocess.call(["svn", "diff", "-r", REVISION, path], stdout=diff_file)

    # Show the equivalent command, naming the file that was actually written.
    print("svn diff -r " + REVISION + " " + path + " > " + diff_name)

Posted by pj at 09:31 PM | Comments (0)

March 24, 2009

CherryPy does ZPT

ChoosingATemplatingLanguage - CherryPy - Trac

Posted by pj at 11:14 AM | Comments (0)

April 15, 2008

Python script for getting changed files

The following Python script gets you a list of files changed between two subversion revisions:

import sys
import os
import re

# Usage: script.py FIRST_REVISION LAST_REVISION

svn_log_command = '/usr/local/bin/svn log -vv -r  '

# Only log lines that mention the repository are of interest.
repository_line = re.compile("svn-repository")

# NOTE(review): the range starts one revision before the first argument;
# confirm that this is intended.
first_revision = int(sys.argv[1]) - 1
last_revision = int(sys.argv[2])

# Each distinct matching line is kept once.
tally = set()

for counter in range(first_revision, last_revision + 1):
    results = os.popen(svn_log_command + str(counter))
    try:
        for line in results:
            if repository_line.search(line):
                tally.add(line)
    finally:
        results.close()

final = sorted(tally)

print(" " + " ".join(final))

Posted by pj at 05:06 PM | Comments (0)

December 14, 2007

pysvn Programmer's Guide

pysvn: pysvn Programmer's Guide

Posted by pj at 09:03 PM | Comments (0)

August 11, 2006

More about the Python in PHP project

PiP - Python in PHP

Posted by pj at 10:06 AM | Comments (0)

Python interpreter embedded in PHP

PECL :: Package :: python

Posted by pj at 10:03 AM | Comments (0)

August 03, 2006

Parsing multiple date formats in Python

# Try each known format in turn. The formats cannot match the same text,
# so the first one that parses is the only one that would.
for date_format in ("%d/%m/%Y", "%B %Y", "%Y"):
    try:
        # [0:5] keeps year, month, day, hour and minute from the parsed time.
        date_time = datetime.datetime(*time.strptime(this_val, date_format)[0:5])
    except (ValueError, TypeError):
        # this_val is not in this format (or is not text); try the next one.
        continue
    break
print("<!--" + str(date_time) + "-->")

Posted by pj at 02:58 PM | Comments (0)

Very cunning Python based Universal Feed Parser

Universal Feed Parser

Posted by pj at 12:51 PM | Comments (0)

May 22, 2006

Notes for using Python `urllib2`

urllib2 - The Missing Manual

Posted by pj at 12:43 PM | Comments (0)

January 16, 2006

Python charting interface

PyGDChart2 - http://www.nullcube.com/software/pygdchart2/doc/index.html

Posted by pj at 08:32 PM

November 16, 2005

Parsing dates in Python

Dates and Times

Posted by pj at 03:42 PM

September 14, 2005

Parsing binary data with Python

4.3 struct -- Interpret strings as packed binary data

Another alternative library is also available:

- http://www.nightmare.com/software.html

Posted by pj at 11:49 AM

August 09, 2005

mysql_robot2.py site indexing script

I've uploaded a copy of my mysql_robot2.py site indexing script for safe keeping.

And here's the DB structure:

mysql_robot.sql

Posted by pj at 10:40 AM

July 17, 2005

Yet another Python Web Framework

Django | The Web framework for perfectionists with deadlines

Posted by pj at 02:38 PM

May 27, 2005

A web crawler written in Python

http://www.newton.cx/~peter/software/crawler.py

Posted by pj at 01:20 PM

Example of how to go from Unicode to HTML

Unicode to HTML - tiddly-pom.com

Posted by pj at 12:54 PM

Information about the Python urlparse module

The urlparse module ::: The Standard Python Library (2005) ::: www.effbot.org

Posted by pj at 12:46 PM

A Python based HTML parser which handles tag soup and does tidying automaticamente

Beautiful Soup: We called him Tortoise because he taught us.

Posted by pj at 11:59 AM

May 26, 2005

Rare urllib2 example for handling response codes

ASPN : Python Cookbook : urllib2 for actions depending on http response codes

Posted by pj at 12:11 PM

May 23, 2005

Handling Unicode encoding in XML with Python

XML.com: Unicode Secrets

Posted by pj at 10:02 PM

April 20, 2005

Testing counters to see if they are odd or even

The following is a simple Python script to test whether a number is odd or even:

# Work in floating point so that halving an odd number leaves a fractional part.
test_figure = float(test_figure)
half_of_it = float(test_figure/2)
#print str(half_of_it)
# int() drops any fractional part, so the two sides are equal only when the
# half is a whole number, i.e. when test_figure was even.
if int(half_of_it) == float(half_of_it):
  return 'even'
else: return 'odd'

It relies on the fact that casting a floating point number to an integer discards its fractional part, and that dividing an odd number by two always leaves a fractional part, whereas dividing an even number by two gives a whole number.

Posted by pj at 11:14 AM

February 15, 2005

Running JavaScript in Python

But why?

python-spidermonkey

Posted by pj at 02:43 PM

October 30, 2004

Blogging iTunes playlists - RSS feeds

Found this posting by Kimbro Staken about a Python script he's written which posts his iTunes playlist to his blog using in-line AppleScript and the MT XML-RPC.

< http://www.xmldatabases.org/movabletype/archives/000159.html >

Should be straightforward to retask it to post 'now playing' titles too?

Posted by pj at 10:58 AM

Paul Hollands