## This script runs against an OR version of OCLC's WorldCat Identities service
## Given a targetKey and startKey it will attempt to find the shortest path from
## the start to target, ignoring all but personal names
## Send enquiries to Thom Hickey (hickey at oclc dot org)

import sys
import time
import urllib
from collections import deque

try:
    import urllib.request
    urlopen = urllib.request.urlopen        # Python 3
except ImportError:
    urlopen = urllib.urlopen                # Python 2

try:
    from xml.etree import cElementTree as ET
except ImportError:
    # cElementTree no longer exists on current Python 3; ElementTree has the same API
    from xml.etree import ElementTree as ET

# Prefix to which an identity key is appended to form the page URL
BASE_URL = 'http://orlabs.oclc.org/Identities/'


def normalizeKey(k):
    """Return key k with every space replaced by '%20' so it can be put in a URL."""
    return k.replace(' ', '%20')


def getName(nameEl):
    """Build a display name from the subfields of a name element.

    nameEl is an ElementTree element or None.  The text of its suba, subb,
    subc, subd and subq children (in that order) is joined with single
    spaces; subfields that are missing or have no text are skipped.

    Returns None when nameEl is None or has no children, otherwise the
    joined string (which is '' when none of the five subfields has text).
    """
    # Explicit tests instead of "not nameEl": truth-testing an Element is
    # deprecated, and spelling it out keeps the original childless -> None rule.
    if nameEl is None or len(nameEl) == 0:
        return None
    parts = [nameEl.findtext('sub' + a) for a in 'abcdq']
    return ' '.join(p for p in parts if p)


def _printable(text):
    """Return text in a form print() can emit safely on Python 2 and 3."""
    if isinstance(text, str):
        return text
    # Python 2 unicode object: encode so non-ASCII names cannot raise on output
    return text.encode('utf-8')


def fetchPage(pnkey, baseUrl=BASE_URL):
    """Download and return the raw XML of the page for identity key pnkey.

    Raises IOError (or a subclass) if the request fails.
    """
    pageObj = urlopen(baseUrl + pnkey)
    try:
        return pageObj.read()
    finally:
        pageObj.close()


def findPath(startKey, targetKey, fetch=fetchPage, delay=0.1):
    """Breadth-first search from startKey for a page that links to targetKey.

    startKey, targetKey -- identity keys, already normalized (see normalizeKey)
    fetch -- callable taking a key and returning that page's XML text
    delay -- seconds to sleep before every fetch

    Only pages whose Identity element has type 'personal' are followed.
    Progress and diagnostics are printed as the search runs.

    Returns the '|'-separated string of names from the start page up to and
    including the page that lists targetKey among its associated names (the
    target's own name is not appended), or None if the queue runs dry first.
    """
    toSearch = deque([startKey])
    # key -> name path of the page that first linked to that key
    visited = {startKey: ''}
    nvisited = 0
    while toSearch:
        pnkey = toSearch.popleft()
        path = visited.get(pnkey, '')
        time.sleep(delay)  # try not to be too hard on the server
        try:
            xmltext = fetch(pnkey)
        except IOError:
            # one unreachable page should not abort the whole crawl
            print('problem fetching %s' % pnkey)
            continue
        try:
            el = ET.fromstring(xmltext)
        except Exception:
            # the parse error class differs between ElementTree versions;
            # Exception still lets KeyboardInterrupt/SystemExit through
            print('problem parsing %s' % pnkey)
            continue
        typeEl = el.find('.//Identity')
        if typeEl is None:
            print('Failed on %s' % pnkey)
            print(_printable(path))
            continue
        if typeEl.attrib.get('type') != 'personal':
            continue
        nvisited += 1
        if nvisited % 10 == 0:
            # progress: current depth and pages visited, rewritten in place
            sys.stdout.write('\r %d %d ' % (1 + path.count('|'), nvisited))
            sys.stdout.flush()
        name = getName(el.find('.//nameInfo/latinName'))
        if not name:
            name = getName(el.find('.//nameInfo/rawName'))
        if not name:
            print('unable to find name for  %s' % pnkey)
            print('path %s' % _printable(path))
            continue
        path += '|' + name
        if nvisited % 1000 == 0:
            print('\n' + _printable(path))
        for k in el.findall('.//associatedNames/name/normName'):
            if not k.text:
                # an empty normName has no key to follow
                continue
            nk = normalizeKey(k.text)
            if nk == targetKey:
                print('\nFound target: %s' % _printable(path))
                print('Visited %d pages' % nvisited)
                return path
            if nk not in visited:
                toSearch.append(nk)
                visited[nk] = path
    return None


#startKey = raw_input('Starting key: ')
#targetKey = raw_input('Ending key: ')
##targetKey = 'lccn-n82-54463' # lorcan
##startKey = 'lccn-n78-51773' # sally
targetKey = 'lccn-n79-21164' # twain
startKey = 'lccn-n79-32879' # austen
##targetKey = 'lccn-n88-34930' # kevin bacon

if __name__ == '__main__':
    findPath(startKey, targetKey)

##Copyright [2008] [OCLC]
##Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
##http://www.apache.org/licenses/LICENSE-2.0
##Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.