#!/usr/bin/env python
#
# Provenance: this script was added as build/normalise_fragment by commit
# 4fca5c020ba2f7d4164d5fe24e576f83d5bfeda3
# (Ian Young, Fri, 8 Nov 2013 16:10:25 +0000):
#
#     New fragment normalisation script, and script to apply same to all
#     fragments.
#
# The same commit added the companion driver build/normalise_all_fragments
# (mode 100755), reproduced here so that it is not lost:
#
#     #!/bin/bash
#
#     find ../entities -name uk\*.xml \
#         -exec python normalise_fragment \{\} \;
#

'''
Normalises a fragment file.

The assumption is that the input file has already been pretty-printed to
some extent. This script normalises it by adjusting the start of the
file:

  * ensures that the file starts with an appropriate XML declaration

  * arranges for all appropriate namespaces to appear on the EntityDescriptor

  * arranges for an appropriate collection on schemaLocation values

  * puts any ID and entityID attributes in the right place

The script also modifies any use of the shibmeta prefix to the modern
shibmd form.

None of this can really be done by any means within XML itself, so this
is a pure text processing application.

With no command-line arguments, the script acts as a filter.

With one command-line argument, the script overwrites the named file.

With two command-line arguments, the script reads from one file and
writes to another.

This script was developed under Python 2.7, but will probably work under 2.6.
Let me know if that turns out not to be the case.  It avoids constructs
that exist only in Python 2 (the print statement, the file() builtin), so
it also runs under Python 3.
'''

import re
import sys
from string import Template

#
# Template to use for the start of the file, up to the EntityDescriptor.
#
# Note that the indentation in this template should be four *spaces* per
# level, independent of the type of indentation used for the rest of the
# script.
#
# NOTE(review): the copy of this script that was reviewed had every piece
# of XML markup stripped out of it, leaving this template empty.  What
# follows is a minimal reconstruction: the XML declaration, the SAML
# metadata default namespace, the shibmd declaration that
# fix_scope_prefix() relies on (the original header, and any namespace
# declarations on it, are discarded), and the two substituted attributes.
# The full namespace list and the xsi:schemaLocation value described in
# the module docstring could not be recovered -- restore them from the
# upstream repository before running this against real fragments.
#
ED_TEMPLATE = Template('''<?xml version="1.0" encoding="UTF-8"?>
<EntityDescriptor xmlns="urn:oasis:names:tc:SAML:2.0:metadata"
    xmlns:ds="http://www.w3.org/2000/09/xmldsig#"
    xmlns:shibmd="urn:mace:shibboleth:metadata:1.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    ID="$ID" entityID="$entityID">
''')

#
# Splits a document into "everything up to and including the
# EntityDescriptor start tag (plus an immediately following newline)" and
# "everything else".  DOTALL lets "." span lines.  The leading ".*" is
# greedy, so if the text "<EntityDescriptor ...>" occurs more than once the
# split happens at the last occurrence.
#
_HEADER_PATTERN = re.compile(
    r"^(.*<EntityDescriptor[^>]+?>[ \t]*?\n?)(.*)$", re.DOTALL)


def ingest(input):
    '''
    Read the given input file and split it into header and remainder.

    input is any object with a read() method returning the whole document
    as a string.  Returns a (header, remainder) tuple, where header ends
    with the EntityDescriptor start tag.

    Raises ValueError if no EntityDescriptor start tag is present.
    '''
    text = input.read()
    match = _HEADER_PATTERN.match(text)
    if match is None:
        raise ValueError("no EntityDescriptor start tag found")
    return match.groups()


def _attribute(name, header):
    '''Return the value of the named attribute as written in header.'''
    # The back-reference makes the closing quote match the opening one, so
    # an apostrophe inside a double-quoted value does not end the value.
    found = re.search(r"\b%s\s*=\s*(['\"])(.*?)\1" % re.escape(name), header)
    if found is None:
        raise ValueError("no %s attribute found in header" % name)
    return found.group(2)


def extract(header):
    '''
    Extract ID and entityID attributes from the header.

    Returns an (ID, entityID) tuple of the attribute values exactly as
    they appear in the text (no entity decoding is performed).

    Raises ValueError if either attribute is missing.
    '''
    entityID = _attribute("entityID", header)
    # The \\b in the pattern keeps "ID" from matching the tail of "entityID".
    ID = _attribute("ID", header)
    return ID, entityID


def _double_quote_safe(value):
    '''Make an attribute value safe to place between double quotes.'''
    # A value that was single-quoted in the input may contain a literal
    # double quote; the template always uses double quotes.
    return value.replace('"', '&quot;')


def construct_header(ID, entityID):
    '''
    Construct a new header with the given attributes.

    Returns ED_TEMPLATE with ID and entityID substituted in.
    '''
    return ED_TEMPLATE.substitute(ID=_double_quote_safe(ID),
                                  entityID=_double_quote_safe(entityID))


def fix_scope_prefix(text):
    '''Changes any use of the shibmeta prefix into shibmd.'''
    return re.sub(r"\bshibmeta:", "shibmd:", text)


def usage():
    '''Display usage information for script (on standard error).'''
    sys.stderr.write("Usage: normalise_fragment.py [[infile] outfile]\n")


def main(args):
    '''
    Command-line application.

    args is the argument list without the program name: empty to filter
    standard input to standard output, one name to rewrite that file in
    place, or two names to read the first and write the second.  Any other
    count prints the usage message and exits with status 1, as does input
    that cannot be normalised.
    '''
    if len(args) > 2:
        usage()
        sys.exit(1)

    source = args[0] if args else "<stdin>"
    try:
        if args:
            # read from a named file
            with open(args[0], "r") as infile:
                head, remainder = ingest(infile)
        else:
            # no command line arguments, read from standard input
            head, remainder = ingest(sys.stdin)
        ID, entityID = extract(head)
    except ValueError as err:
        sys.stderr.write("normalise_fragment: %s: %s\n" % (source, err))
        sys.exit(1)

    new_text = construct_header(ID, entityID) + fix_scope_prefix(remainder)

    if not args:
        sys.stdout.write(new_text)
        return

    # The output is only opened (and so truncated) once the new text has
    # been built, so a failure above never destroys a file that is being
    # rewritten in place.  With one argument the last name is also the
    # input name; with two it is the separate output file.
    with open(args[-1], "w") as outfile:
        outfile.write(new_text)


if __name__ == "__main__":
    main(sys.argv[1:])