diff --git a/doc/get-rivethd-marcxml b/doc/get-rivethd-marcxml --- a/doc/get-rivethd-marcxml +++ b/doc/get-rivethd-marcxml @@ -1,96 +1,96 @@ #! /usr/bin/env python """\ %prog [ ...] """ from __future__ import print_function import argparse ap = argparse.ArgumentParser(usage=__doc__) ap.add_argument("XMLFILES", metavar='file', nargs="+", help="Inspire-exported MARC XML file to read") args = ap.parse_args() if not args.XMLFILES: exit(1) import rivet anas = {} for aname in rivet.AnalysisLoader.analysisNames(): ana = rivet.AnalysisLoader.getAnalysis(aname) # TODO: all anas *should* have an Inspire ID... anas.setdefault(ana.inspireId(), []).append(ana.name()) del anas[""] # print(anas.keys()) from lxml import etree #as ET #nsm = {None : "http://www.loc.gov/MARC21/slim"} nsm = {"mx" : "http://www.loc.gov/MARC21/slim"} for ifile, xmlfile in enumerate(args.XMLFILES): import urllib2, re, os if ifile > 0: print() print("Processing {}...".format(xmlfile)) jsonfile = xmlfile.replace(".marc.xml", ".json") if os.path.exists(jsonfile): print("{} exists, skipping {} testing".format(jsonfile, xmlfile)) continue OUT = {} tree = etree.parse(xmlfile) recnodes = tree.findall("/mx:record", namespaces=nsm) for rn in recnodes: print() # ctl = rn.find("./mx:controlfield", namespaces=nsm) ctls = rn.xpath("./mx:controlfield[@tag=001]", namespaces=nsm) ins = ctls[0].text if ctls else None print("Inspire", ins) summs = rn.xpath("./mx:datafield[@tag='245']/mx:subfield[@code='a']", namespaces=nsm) summ = summs[0].text if summs else None print("Title:", summ) expts = rn.xpath("./mx:datafield[@tag='710']/mx:subfield[@code='g']", namespaces=nsm) expt = expts[0].text if expts else None print("Expt:", expt) dois = rn.xpath("./mx:datafield[@tag='024']/mx:subfield[(@code='2') and (text()='DOI')]/../mx:subfield[@code='a']", namespaces=nsm) doi = dois[0].text.replace("oai:arXiv.org:", "") if dois else None print("DOI:", doi) arxs = rn.xpath("./mx:datafield[@tag='035']/mx:subfield[(@code='9') and (text()='arXiv')]/../mx:subfield[@code='a']", namespaces=nsm) arx = arxs[0].text.replace("oai:arXiv.org:", "") if arxs else None print("arXiv:", arx) cdss = rn.xpath("./mx:datafield[@tag='035']/mx:subfield[(@code='9') and (text()='CDS')]/../mx:subfield[@code='a']", namespaces=nsm) cds = cdss[0].text if cdss else None print("CDS:", cds) hds = rn.xpath("./mx:datafield[@tag='035']/mx:subfield[(@code='9') and (text()='HEPDATA')]/../mx:subfield[@code='a']", namespaces=nsm) hd = hds[0].text if hds else None if not hd: try: hdurl = "https://hepdata.net/record/ins{}".format(ins) u = urllib2.urlopen(hdurl) hd = "ins{}".format(ins) except urllib2.URLError: pass print("HepData:", hd) rivetanas = anas.get(ins) - nums = rn.xpath("./mx:datafield[@tag='037']/mx:subfield[(@code='9') and (text()='arXiv:reportnumber')]/../mx:subfield[@code='a']", namespaces=nsm) - reportnums = [num.text for num in nums] + nums = rn.xpath("./mx:datafield[@tag='037']/mx:subfield[@code='a']", namespaces=nsm) + reportnums = [num.text for num in nums if not 'arXiv' in num.text] print("Nums:", reportnums) OUT[ins] = [summ, expt, doi, cds, arx, hd, rivetanas, reportnums] print(OUT[ins]) ## Write out as JSON import json with open(jsonfile, "wb") as jf: json.dump(OUT, jf)