Source code for lodstorage.sparql

'''
Created on 2020-08-14

@author: wf
'''
from SPARQLWrapper import SPARQLWrapper2
from SPARQLWrapper.Wrapper import POSTDIRECTLY, POST
from lodstorage.lod import LOD
import datetime
import time
from sys import stderr

class SPARQL(object):
    '''
    wrapper for SPARQL e.g. Apache Jena, Virtuoso, Blazegraph

    :ivar url: full endpoint url (including mode)
    :ivar mode: 'query' or 'update'
    :ivar debug: True if debugging is active
    :ivar typedLiterals: True if INSERT should be done with typedLiterals
    :ivar profile(boolean): True if profiling / timing information should be displayed
    :ivar sparql: the SPARQLWrapper2 instance to be used
    :ivar method(str): the HTTP method to be used 'POST' or 'GET'
    '''

    def __init__(self, url, mode='query', debug=False, typedLiterals=False, profile=False, agent='PyLodStorage', method='POST'):
        '''
        Constructor a SPARQL wrapper

        Args:
            url(string): the base URL of the endpoint - the mode query/update is going to be appended
            mode(string): 'query' or 'update'
            debug(bool): True if debugging is to be activated
            typedLiterals(bool): True if INSERT should be done with typedLiterals
            profile(boolean): True if profiling / timing information should be displayed
            agent(string): the User agent to use
            method(string): the HTTP method to be used 'POST' or 'GET'
        '''
        # fix: the original code was self.url="url%s" % (mode) which dropped the
        # given base url and produced the literal string "urlquery"/"urlupdate";
        # per the docstring the mode is appended to the base url
        self.url = f"{url}/{mode}"
        self.mode = mode
        self.debug = debug
        self.typedLiterals = typedLiterals
        self.profile = profile
        # the SPARQLWrapper2 instance works on the base url
        self.sparql = SPARQLWrapper2(url)
        self.method = method
        self.sparql.agent = agent
[docs] def rawQuery(self,queryString,method='POST'): ''' query with the given query string Args: queryString(string): the SPARQL query to be performed method(string): POST or GET - POST is mandatory for update queries Returns: list: the raw query result as bindings ''' self.sparql.setQuery(queryString) self.sparql.method=method queryResult = self.sparql.query() return queryResult
[docs] def getValue(self,sparqlQuery:str,attr:str): ''' get the value for the given SPARQL query using the given attr Args: sparql(SPARQL): the SPARQL endpoint to ge the value for sparqlQuery(str): the SPARQL query to run attr(str): the attribute to get ''' if self.debug: print(sparqlQuery) qLod=self.queryAsListOfDicts(sparqlQuery) return self.getFirst(qLod, attr)
[docs] def getValues(self,sparqlQuery:str,attrList:list): ''' get Values for the given sparlQuery and attribute list ''' if self.debug: print(sparqlQuery) qLod=self.queryAsListOfDicts(sparqlQuery) if not (len(qLod)==1): msg=f"getValues for {attrList} failed for {qLod}" raise Exception(msg) record=qLod[0] values=() for attr in attrList: if not attr in record: msg=f"getValues failed for attribute {attr} which is missing in result record {record}" raise Exception(msg) recordTuple=(record[attr],) values+=recordTuple return values
[docs] def getFirst(self,qLod:list,attr:str): ''' get the column attr of the first row of the given qLod list Args: qLod(list): the list of dicts (returned by a query) attr(str): the attribute to retrieve Returns: object: the value ''' if len(qLod)==1 and attr in qLod[0]: value=qLod[0][attr] return value raise Exception(f"getFirst for attribute {attr} failed for {qLod}")
[docs] def getResults(self,jsonResult): ''' get the result from the given jsonResult Args: jsonResult: the JSON encoded result Returns: list: the list of bindings ''' return jsonResult.bindings
[docs] def insert(self,insertCommand): ''' run an insert Args: insertCommand(string): the SPARQL INSERT command Returns: a response ''' self.sparql.setRequestMethod(POSTDIRECTLY) response=None exception=None try: response=self.rawQuery(insertCommand, method=POST) #see https://github.com/RDFLib/sparqlwrapper/issues/159#issuecomment-674523696 # dummy read the body response.response.read() except Exception as ex: exception=ex if self.debug: print (ex) return response,exception
[docs] def getLocalName(self,name): ''' retrieve valid localname from a string based primary key https://www.w3.org/TR/sparql11-query/#prefNames Args: name(string): the name to convert Returns: string: a valid local name ''' localName=''.join(ch for ch in name if ch.isalnum()) return localName
[docs] def insertListOfDicts(self,listOfDicts,entityType,primaryKey,prefixes,limit=None,batchSize=None, profile=False): ''' insert the given list of dicts mapping datatypes Args: entityType(string): the entityType to use as a primaryKey(string): the name of the primary key attribute to use prefix(string): any PREFIX statements to be used limit(int): maximum number of records to insert batchSize(int): number of records to send per request Return: a list of errors which should be empty on full success datatype maping according to https://www.w3.org/TR/xmlschema-2/#built-in-datatypes mapped from https://docs.python.org/3/library/stdtypes.html compare to https://www.w3.org/2001/sw/rdb2rdf/directGraph/ http://www.bobdc.com/blog/json2rdf/ https://www.w3.org/TR/json-ld11-api/#data-round-tripping https://stackoverflow.com/questions/29030231/json-to-rdf-xml-file-in-python ''' if limit is not None: listOfDicts=listOfDicts[:limit] else: limit=len(listOfDicts) total=len(listOfDicts) if batchSize is None: return self.insertListOfDictsBatch(listOfDicts, entityType, primaryKey, prefixes,total=total) else: startTime=time.time() errors=[] # store the list in batches for i in range(0, total, batchSize): recordBatch=listOfDicts[i:i+batchSize] batchErrors=self.insertListOfDictsBatch(recordBatch, entityType, primaryKey, prefixes,batchIndex=i,total=total,startTime=startTime) errors.extend(batchErrors) if self.profile: print("insertListOfDicts for %9d records in %6.1f secs" % (len(listOfDicts),time.time()-startTime),flush=True) return errors
    def insertListOfDictsBatch(self,listOfDicts,entityType,primaryKey,prefixes,title='batch',batchIndex=None,total=None,startTime=None):
        '''
        insert a Batch part of listOfDicts by building and running a single
        SPARQL INSERT DATA command for all records of the batch

        Args:
            listOfDicts(list): the records to insert
            entityType(string): the entityType to use as a
            primaryKey(string): the name of the primary key attribute to use
            prefixes(string): any PREFIX statements to be used
            title(string): the title to display for the profiling (if any)
            batchIndex(int): the start index of the current batch
            total(int): the total number of records for all batches
            startTime(datetime): the start of the batch processing

        Return:
            a list of errors which should be empty on full success
        '''
        errors=[]
        size=len(listOfDicts)
        if batchIndex is None:
            batchIndex=0
        batchStartTime=time.time()
        if startTime is None:
            startTime=batchStartTime
        rdfprefix="PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
        insertCommand=f'{rdfprefix}{prefixes}\nINSERT DATA {{\n'
        for index,record in enumerate(listOfDicts):
            # records without a usable primary key are reported, not inserted
            if not primaryKey in record:
                errors.append(f"missing primary key {primaryKey} in record {index}")
            else:
                primaryValue=record[primaryKey]
                if primaryValue is None:
                    errors.append(f"primary key {primaryKey} value is None in record {index}")
                else:
                    # derive a SPARQL-safe local name for the subject from the primary key value
                    encodedPrimaryValue=self.getLocalName(primaryValue)
                    tSubject=f"{entityType}__{encodedPrimaryValue}"
                    insertCommand+=f' {tSubject} rdf:type "{entityType}".\n'
                    # one triple per attribute of the record
                    for keyValue in record.items():
                        key,value=keyValue
                        # convert key if necessary
                        key=self.getLocalName(key)
                        valueType=type(value)
                        if self.debug:
                            print("%s(%s)=%s" % (key,valueType,value))
                        tPredicate=f"{entityType}_{key}"
                        tObject=value
                        # map the python type to a SPARQL literal; int/float are only
                        # typed when self.typedLiterals is set, date/dateTime always are
                        if valueType == str:
                            escapedString=self.controlEscape(value)
                            tObject='"%s"' % escapedString
                        elif valueType==int:
                            if self.typedLiterals:
                                tObject='"%d"^^<http://www.w3.org/2001/XMLSchema#integer>' %value
                            pass
                        elif valueType==float:
                            if self.typedLiterals:
                                tObject='"%s"^^<http://www.w3.org/2001/XMLSchema#decimal>' %value
                            pass
                        elif valueType==bool:
                            pass
                        elif valueType==datetime.date:
                            #if self.typedLiterals:
                            tObject='"%s"^^<http://www.w3.org/2001/XMLSchema#date>' %value
                            pass
                        elif valueType==datetime.datetime:
                            tObject='"%s"^^<http://www.w3.org/2001/XMLSchema#dateTime>' %value
                            pass
                        else:
                            errors.append("can't handle type %s in record %d" % (valueType,index))
                            tObject=None
                        if tObject is not None:
                            insertRecord=' %s %s %s.\n' % (tSubject,tPredicate,tObject)
                            insertCommand+=insertRecord
        insertCommand+="\n}"
        if self.debug:
            print (insertCommand,flush=True)
        response,ex=self.insert(insertCommand)
        if response is None and ex is not None:
            # NOTE(review): index here is the loop variable left over from the
            # loop above, i.e. the last record of the batch - not necessarily
            # the record that caused the failure
            errors.append("%s for record %d" % (str(ex),index))
        if self.profile:
            print("%7s for %9d - %9d of %9d %s in %6.1f s -> %6.1f s" % (title,batchIndex+1,batchIndex+size,total,entityType,time.time()-batchStartTime,time.time()-startTime),flush=True)
        return errors
controlChars = [chr(c) for c in range(0x20)]
[docs] @staticmethod def controlEscape(s): ''' escape control characters see https://stackoverflow.com/a/9778992/1497139 ''' escaped=u''.join([c.encode('unicode_escape').decode('ascii') if c in SPARQL.controlChars else c for c in s]) escaped=escaped.replace('"','\\"') return escaped
[docs] def query(self,queryString,method=POST): ''' get a list of results for the given query Args: queryString(string): the SPARQL query to execute method(string): the method eg. POST to use Returns: list: list of bindings ''' queryResult=self.rawQuery(queryString,method=method) if self.debug: print(queryString) if hasattr(queryResult, "info"): if "content-type" in queryResult.info(): ct = queryResult.info()["content-type"] if "text/html" in ct: response=queryResult.response.read().decode() if not "Success" in response: raise("%s failed: %s", response) return None jsonResult=queryResult.convert() return self.getResults(jsonResult)
[docs] def queryAsListOfDicts(self,queryString,fixNone:bool=False,sampleCount:int=None): ''' get a list of dicts for the given query (to allow round-trip results for insertListOfDicts) Args: queryString(string): the SPARQL query to execute fixNone(bool): if True add None values for empty columns in Dict sampleCount(int): the number of samples to check Returns: list: a list ofDicts ''' records=self.query(queryString,method=self.method) listOfDicts=self.asListOfDicts(records,fixNone=fixNone,sampleCount=sampleCount) return listOfDicts
[docs] @staticmethod def strToDatetime(value,debug=False): ''' convert a string to a datetime Args: value(str): the value to convert Returns: datetime: the datetime ''' dateFormat="%Y-%m-%d %H:%M:%S.%f" if "T" in value and "Z" in value: dateFormat="%Y-%m-%dT%H:%M:%SZ" dt=None try: dt=datetime.datetime.strptime(value,dateFormat) except ValueError as ve: if debug: print(str(ve)) return dt
[docs] def asListOfDicts(self,records,fixNone:bool=False,sampleCount:int=None): ''' convert SPARQL result back to python native Args: record(list): the list of bindings fixNone(bool): if True add None values for empty columns in Dict sampleCount(int): the number of samples to check Returns: list: a list of Dicts ''' resultList=[] fields=None if fixNone: fields=LOD.getFields(records, sampleCount) for record in records: resultDict={} for keyValue in record.items(): key,value=keyValue datatype=value.datatype if datatype is not None: if datatype=="http://www.w3.org/2001/XMLSchema#integer": resultValue=int(value.value) elif datatype=="http://www.w3.org/2001/XMLSchema#decimal": resultValue=float(value.value) elif datatype=="http://www.w3.org/2001/XMLSchema#boolean": resultValue=value.value in ['TRUE','true'] elif datatype=="http://www.w3.org/2001/XMLSchema#date": dt=datetime.datetime.strptime(value.value,"%Y-%m-%d") resultValue=dt.date() elif datatype=="http://www.w3.org/2001/XMLSchema#dateTime": dt=SPARQL.strToDatetime(value.value,debug=self.debug) resultValue=dt else: # unsupported datatype resultValue=value.value else: resultValue=value.value resultDict[key]=resultValue if fixNone: for field in fields: if not field in resultDict: resultDict[field]=None resultList.append(resultDict) return resultList
[docs] def printErrors(self,errors): ''' print the given list of errors Args: errors(list): a list of error strings Returns: boolean: True if the list is empty else false ''' if len(errors)>0: print("ERRORS:") for error in errors: print(error,flush=True,file=stderr) return True else: return False