# Source code for PyFoam.Basics.SpreadsheetData

# coding: utf-8

#  ICE Revision: $Id: $
"""
Data that can go into a spreadsheet (title line and rectangular data)
"""

try:
    import numpy
except ImportError:
    # assume this is pypy and retry
    import numpypy
    import numpy

import copy
import re

from PyFoam.Error import error,FatalErrorPyFoamException,warning

from PyFoam.ThirdParty.six import PY3
from PyFoam.ThirdParty.six import b as toByte

class WrongDataSize(FatalErrorPyFoamException):
    """Raised when two data sets that should be combined differ in size."""

    def __init__(self, txt="Size of the arrays differs"):
        FatalErrorPyFoamException.__init__(self, txt)
class SpreadsheetData(object):
    """Collects data that could go into a spreadsheet.

    The focus of this class is on storing all the data at once"""

    def __init__(self,
                 timeName=None,
                 validData=None,
                 validMatchRegexp=False,
                 csvName=None,
                 txtName=None,
                 excelName=None,
                 data=None,
                 names=None,
                 isSampleFile=False,
                 skip_header=0,
                 stripCharacters=None,
                 replaceFirstLine=None,
                 title=None):
        """Either this is constructed from a file or from the data and
        the column headers

        :param timeName: the data colum that is to be considered the time in this file
        :param validData: names of the valid data columns (all others should be discarded)
        :param validMatchRegexp: Should the validData be interpreted as regular expressions
        :param csvName: name of the CSV-file the data should be constructed from,
        :param txtName: name of a file the data should be constructed from,
        :param excelName: name of a Excel-file the data should be constructed from (uses the first sheet in the file),
        :param data: the actual data to use
        :param names: the names for the column header
        :param isSampleFile: file produced by sample/set. Field names are determined from the filename
        :param stripCharacters: String with characters that should be removed before reading
        :param replaceFirstLine: String with a line that should replace the first line (usually to replace the header)
        :param title: a name that is used to make unique heades names"""

        def filterChars(fName):
            # generator that optionally replaces the header line and
            # strips unwanted characters before the data is parsed
            if "readlines" in dir(fName):
                f = fName
            else:
                f = open(fName)
            first = True
            for l in f.readlines():
                if first and replaceFirstLine:
                    l = replaceFirstLine+"\n"
                elif stripCharacters:
                    l = l.translate(None, stripCharacters)
                first = False
                try:
                    # numpy on Python 3 wants bytes
                    yield toByte(l)
                except AttributeError:
                    yield l
            if "close" in dir(f):
                f.close()

        self.title = title
        nrFileSpec = len([1 for i in [csvName, txtName, excelName]
                          if not i is None])
        if (nrFileSpec > 0) and not data is None:
            error("SpreadsheetData is either constructed from data or from a file")
        if data is None and nrFileSpec > 1:
            error("Only one file specification allowed")

        if csvName:
            try:
                rec = numpy.recfromcsv(filterChars(csvName),
                                       names=True if names is None else names,
                                       skip_header=skip_header)
                data = [tuple(float(x) for x in i) for i in rec]
                names = list(rec.dtype.names)
            except AttributeError:
                # for old numpy-versions
                data = [tuple(d) for d in numpy.loadtxt(csvName,
                                                        delimiter=',',
                                                        skiprows=1)]
                names = open(csvName).readline().strip().split(',')
            # redo this to make sure that everything is float
            self.data = numpy.array(data,
                                    dtype=list(zip(names,
                                                   ['f8']*len(names))))
        elif txtName:
            try:
                if isSampleFile:
                    from os import path
                    raw = numpy.recfromtxt(filterChars(txtName))
                    # field names are encoded in the filename after the first '_'
                    rawName = path.splitext(path.basename(txtName))[0].split("_")[1:]
                    pData = [list(raw[:, 0])]
                    names = ["coord"]
                    if raw.shape[1] == len(rawName)+1:
                        # scalars
                        for i, n in enumerate(rawName):
                            pData.append(list(raw[:, 1+i]))
                            names.append(n)
                    elif raw.shape[1] == 3*len(rawName)+1:
                        # vectors: one column per component plus magnitude
                        for i, n in enumerate(rawName):
                            for j, c in enumerate(["x", "y", "z"]):
                                pData.append(list(raw[:, 1+i*3+j]))
                                names.append(n+"_"+c)
                            vals = [raw[:, 1+i*3+j] for j in range(3)]
                            pData.append(list(numpy.sqrt(vals[0]*vals[0] +
                                                         vals[1]*vals[1] +
                                                         vals[2]*vals[2])))
                            names.append(n+"_mag")
                    else:
                        error("List of names", rawName, "does not fit number of colums",
                              raw.shape[1], "should be", len(rawName)+1,
                              "for scalars or", len(rawName)*3+1, "for vector")
                    data = [tuple(v) for v in numpy.asarray(pData).T]
                else:
                    rec = numpy.recfromtxt(filterChars(txtName), names=True)
                    data = [tuple(float(x) for x in i) for i in rec]
                    if names is None:
                        names = list(rec.dtype.names)
                    else:
                        nr = len(list(rec.dtype.names))
                        if title is None:
                            # use the surplus leading names as the title
                            off = len(names)-nr+1
                            self.title = "_".join(names[:off])
                            names = names[:off]+["index"]+names[off:]
                        names = names[-nr:]
            except AttributeError:
                # for old numpy-versions
                data = [tuple(v) for v in numpy.loadtxt(txtName)]
                names = open(txtName).readline().strip().split()[1:]
            # redo this to make sure that everything is float
            self.data = numpy.array(data,
                                    dtype=list(zip(names,
                                                   ['f8']*len(names))))
        elif excelName:
            import pandas
            rec = pandas.read_excel(excelName).to_records()
            data = [tuple(float(x) for x in i) for i in rec]
            names = list(rec.dtype.names)
            self.data = numpy.array(data,
                                    dtype=list(zip(names,
                                                   ['f8']*len(names))))
        else:
            if data is not None and names is None:
                error("No names given for the data")
            # guess per-column types from the first row: float or string
            types = []
            for d in data[0]:
                try:
                    float(d)
                    types.append('f8')
                except ValueError:
                    types.append('S')
            for i, t in enumerate(types):
                if t == "S":
                    # size string columns to the longest occurring value
                    l = max(len(str(d[i])) for d in data)+1
                    types[i] = "S%d" % l
            self.data = numpy.array([tuple(v) for v in data],
                                    dtype=list(zip(names, types)))

        if timeName:
            try:
                index = list(self.data.dtype.names).index(timeName)
            except ValueError:
                error("Time name", timeName, "not in", self.data.dtype.names)
        else:
            # by default the first column is the time
            index = 0
        self.time = self.data.dtype.names[index]
        self.eliminatedNames = None
        if validData:
            # keep only the time column and the columns that match validData
            usedData = []
            usedNames = []
            for n in self.data.dtype.names:
                if n == self.time or self.validName(n, validData, validMatchRegexp):
                    usedData.append(tuple(self.data[n]))
                    usedNames.append(n)
            self.eliminatedNames = set(self.data.dtype.names)-set(usedNames)
            usedData = numpy.array(usedData).transpose()
            self.data = numpy.array([tuple(v) for v in usedData],
                                    dtype=list(zip(usedNames,
                                                   ['f8']*len(usedNames))))
            index = list(self.data.dtype.names).index(self.time)
        if self.title != None:
            # prefix every column except the time with the title
            self.data.dtype.names = \
                [self.title+" "+x for x in self.data.dtype.names[0:index]] + \
                [self.data.dtype.names[index]] + \
                [self.title+" "+x for x in self.data.dtype.names[index+1:]]
[docs] def validName(self,n,validData,validMatchRegexp=False): if n in validData: return True elif validMatchRegexp: for r in validData: exp=None try: exp=re.compile(r) except: pass if not exp is None: if exp.search(n): return True return False
[docs] def names(self,withTime=True): if withTime: return copy.copy(self.data.dtype.names) else: ind=self.data.dtype.names.index(self.timeName()) return self.data.dtype.names[:ind]+self.data.dtype.names[ind+1:]
[docs] def timeName(self): return self.time
[docs] def rename(self,f,renameTime=False): """Rename all the columns according to a function. Time only if specified""" newNames=[] for c in self.data.dtype.names: if not renameTime and c==self.time: newNames.append(c) else: newNames.append(f(c)) if c==self.time: self.time=newNames[-1] self.data.dtype.names=newNames
[docs] def size(self): return self.data.size
[docs] def writeCSV(self,fName, delimiter=","): """Write data to a CSV-file :param fName: Name of the file :param delimiter: Delimiter to be used in the CSV-file""" f=open(fName,"wb") if PY3: f.write(toByte(delimiter.join(self.names())+"\n")) else: f.write(delimiter.join(self.names())+"\n") numpy.savetxt(f,self.data,delimiter=delimiter) f.close()
[docs] def tRange(self,time=None): """Return the range of times :param time: name of the time. If None the first column is used""" if time==None: time=self.time t=self.data[time] return (t[0],t[-1])
[docs] def join(self,other,time=None,prefix=None): """Join this object with another. Assume that they have the same amount of rows and that they have one column that designates the time and is called the same and has the same values :param other: the other array :param time: name of the time. If None the first column is used :param prefix: String that is added to the other names. If none is given then the title is used""" if time==None: time=self.time if prefix==None: prefix=other.title if prefix==None: prefix="other_" else: prefix+="_" t1=self.data[time] t2=other.data[time] if len(t1)!=len(t2): raise WrongDataSize() if max(abs(t1-t2))>1e-10: raise WrongDataSize("Times do not have the same values") names=[] data=[] for n in self.names(): names.append(n) data.append(self.data[n]) for n in other.names(): if n!=time: if n in self.names(): names.append(prefix+n) else: names.append(n) data.append(other.data[n]) return SpreadsheetData(names=names, data=numpy.array(data).transpose())
def __add__(self,other): """Convinience function for joining data""" return self.join(other)
[docs] def recalcData(self,name,expr,create=False): """Recalc or add a column to the data :param name: the colum (must exist if it is not created. Otherwise it must not exist) :param expr: the expression to calculate. All present column names are usable as variables. There is also a variable data for subscripting if the data is not a valid variable name. If the column is not create then there is also a variable this that is an alias for the name :param create: whether a new data item should be created""" if create and name in self.names(): error("Item",name,"already exists in names",self.names()) elif not create and not name in self.names(): error("Item",name,"not in names",self.names()) result=eval(expr,dict([(n,self.data[n]) for n in self.names()]+[("data",self.data)]+ ([("this",self.data[name] if not create else [])]))) if not create: self.data[name]=result else: self.append(name,result)
[docs] def append(self, name, data, allowDuplicates=False): """Add another column to the data. Assumes that the number of rows is right :param name: the name of the column :param data: the actual data :param allowDuplicates: If the name already exists make it unique by appending _1, _2 ...""" arr = numpy.asarray(data) newname=name if newname in self.names() and allowDuplicates: cnt=1 while newname in self.names(): newname="%s_%d" % (name,cnt) cnt+=1 warning("Changing name",name,"to",newname,"bacause it already exists in the data") newdtype = numpy.dtype(self.data.dtype.descr + [(newname, 'f8')]) newrec = numpy.empty(self.data.shape, dtype=newdtype) for field in self.data.dtype.fields: newrec[field] = self.data[field] newrec[name] = arr self.data=newrec
def __call__(self, t, name=None, time=None, invalidExtend=False, noInterpolation=False): """'Evaluate' the data at a specific time by linear interpolation :param t: the time at which the data should be evaluated :param name: name of the data column to be evaluated. Assumes that that column is ordered in ascending order. If unspecified a dictionary with the values from all columns is returned :param time: name of the time column. If none is given then the first column is assumed :param invalidExtend: if t is out of the valid range then use the smallest or the biggest value. If False use nan :param noInterpolation: if t doesn't exactly fit a data-point return 'nan'""" if time==None: time=self.time if name is None: result={} for n in self.names(): if n!=time: result[n]=self(t, name=n, time=time, invalidExtend=invalidExtend, noInterpolation=noInterpolation) return result x=self.data[time] y=self.data[name] isString=y.dtype!=numpy.float64 # get extremes if t<x[0]: if invalidExtend: return y[0] else: return float('nan') if not isString else "" elif t>x[-1]: if invalidExtend: return y[-1] else: return float('nan') if not isString else "" if noInterpolation: if t==x[0]: return y[0] elif t==x[-1]: return y[-1] iLow=0 iHigh=len(x)-1 while (iHigh-iLow)>1: iNew = iLow + int((iHigh-iLow)/2) if x[iNew]==t: # we got lucky return y[iNew] elif t < x[iNew]: iHigh=iNew else: iLow=iNew if noInterpolation: return float('nan') if not isString else "" else: if isString: return y[iLow] if (t-x[iLow])/(x[iHigh]-x[iLow])<0.5 else y[iHigh] else: return y[iLow] + (y[iHigh]-y[iLow])*(t-x[iLow])/(x[iHigh]-x[iLow])
[docs] def addTimes(self,times,time=None,interpolate=False,invalidExtend=False): """Extend the data so that all new times are represented (add rows if they are not there) :param time: the name of the column with the time :param times: the times that shoild be there :param interpolate: interpolate the data in new rows. Otherwise insert 'nan' :param invalidExtend: if t is out of the valid range then use the smallest or the biggest value. If False use nan""" if time==None: time=self.time if len(times)==len(self.data[time]): same=True for i in range(len(times)): if times[i]!=self.data[time][i]: same=False break if same: # No difference between the times return newData=[] otherI=0 originalI=0 while otherI<len(times): goOn=originalI<len(self.data[time]) while goOn and times[otherI]>self.data[time][originalI]: newData.append(self.data[originalI]) originalI+=1 goOn=originalI<len(self.data[time]) append=True if originalI<len(self.data[time]): if times[otherI]==self.data[time][originalI]: newData.append(self.data[originalI]) originalI+=1 otherI+=1 append=False if append: t=times[otherI] newRow=[] for n in self.names(): if n==time: newRow.append(t) elif interpolate: newRow.append(self(t,n,time=time,invalidExtend=invalidExtend)) else: newRow.append(float('nan')) newData.append(newRow) otherI+=1 while originalI<len(self.data[time]): newData.append(self.data[originalI]) originalI+=1 self.data=numpy.array([tuple(v) for v in newData],dtype=self.data.dtype)
[docs] def resample(self, other, name, otherName=None, time=None, invalidExtend=False, extendData=False, noInterpolation=False): """Calculate values from another dataset at the same times as in this data-set :param other: the other data-set :param name: name of the data column to be evaluated. Assumes that that column is ordered in ascending order :param time: name of the time column. If none is given then the first column is assumed :param invalidExtend: see __call__ :param extendData: if the time range of x is bigger than the range then extend the range before resampling :param noInterpolation: if t doesn't exactly fit a data-point return 'nan'""" if time==None: time=self.time if extendData and ( self.data[time][0] > other.data[time][0] or \ self.data[time][-1] < other.data[time][-1]): pre=[] i=0 while other.data[time][i] < self.data[time][0]: data=[] for n in self.names(): if n==time: data.append(other.data[time][i]) else: data.append(float('nan')) pre.append(data) i+=1 if i>=len(other.data[time]): break if len(pre)>0: self.data=numpy.concatenate((numpy.array([tuple(v) for v in pre], dtype=self.data.dtype), self.data)) post=[] i=-1 while other.data[time][i] > self.data[time][-1]: data=[] for n in self.names(): if n==time: data.append(other.data[time][i]) else: data.append(float('nan')) post.append(data) i-=1 if abs(i)>=len(other.data[time])+1: break post.reverse() if len(post)>0: self.data=numpy.concatenate((self.data,numpy.array([tuple(p) for p in post], dtype=self.data.dtype))) result=[] for t in self.data[time]: nm=name if otherName: nm=otherName result.append(other(t,nm, time=time, invalidExtend=invalidExtend, noInterpolation=noInterpolation)) return result
[docs] def compare(self, other, name, otherName=None, time=None, common=False, minTime=None, maxTime=None): """Compare this data-set with another. The time-points of this dataset are used as a reference. Returns a dictionary with a number of norms: maximum absolute difference, average absolute difference on all timepoints, average absolute difference weighted by time :param other: the other data-set :param name: name of the data column to be evaluated. Assumes that that column is ordered in ascending order :param time: name of the time column. If none is given then the first column is assumed :param common: cut off the parts where not both data sets are defined :param minTime: first time which should be compared :param maxTime: last time to compare""" if time==None: time=self.time x=self.data[time] y=self.data[name] y2=self.resample(other,name,otherName=otherName,time=time,invalidExtend=True) minT,maxT=minTime,maxTime if common: minTmp,maxTmp=max(x[0],other.data[time][0]),min(x[-1],other.data[time][-1]) for i in range(len(x)): if minTmp<=x[i]: minT=x[i] break for i in range(len(x)): val=x[-(i+1)] if maxTmp>=val: maxT=val break else: minT,maxT=x[0],x[-1] result = { "max" : None, "maxPos" : None, "average" : None, "wAverage" : None, "tMin": None, "tMax": None } if minT==None or maxT==None: return result if minTime: if minTime>minT: minT=minTime if maxTime: if maxTime<maxT: maxT=maxTime if maxT<minT: return result maxDiff=0 maxPos=x[0] sumDiff=0 sumWeighted=0 cnt=0 for i,t in enumerate(x): if t<minT or t>maxT: continue cnt+=1 val1=y[i] val2=y2[i] diff=abs(val1-val2) if diff>maxDiff: maxDiff=diff maxPos=x[i] sumDiff+=diff weight=0 if t>minT: weight+=(t-x[i-1])/2 if t<maxT: weight+=(x[i+1]-t)/2 sumWeighted+=weight*diff return { "max" : maxDiff, "maxPos" : maxPos, "average" : sumDiff/cnt, "wAverage" : sumWeighted/(maxT-minT), "tMin": minT, "tMax": maxT}
[docs] def metrics(self, name, time=None, minTime=None, maxTime=None): """Calculates the metrics for a data set. Returns a dictionary with a number of norms: minimum, maximum, average, average weighted by time :param name: name of the data column to be evaluated. Assumes that that column is ordered in ascending order :param time: name of the time column. If none is given then the first column is assumed :param minTime: first time to take metrics from :param maxTime: latest time to take matrics from""" if time==None: time=self.time x=self.data[time] y=self.data[name] minVal=1e40 maxVal=-1e40 sum=0 sumWeighted=0 minT,maxT=x[0],x[-1] if minTime: if minTime>minT: minT=minTime if maxTime: if maxTime<maxT: maxT=maxTime cnt=0 for i,t in enumerate(x): if t<minT or t>maxT: continue cnt+=1 val=y[i] maxVal=max(val,maxVal) minVal=min(val,minVal) sum+=val weight=0 if i>0: weight+=(t-x[i-1])/2 if i<(len(x)-1): weight+=(x[i+1]-t)/2 sumWeighted+=weight*val return { "max" : maxVal, "min" : minVal, "average" : sum/max(cnt,1), "wAverage" : sumWeighted/(maxT-minT), "tMin": x[0], "tMax": x[-1]}
[docs] def getData(self,reindex=True): """Return a dictionary of the data in the DataFrame format of pandas :param: drop duplicate times (setting it to False might break certain Pandas-operations)""" try: from PyFoam.Wrappers.Pandas import PyFoamDataFrame except ImportError: warning("No pandas-library installed. Returning None") return None return PyFoamDataFrame(self.getSeries(reindex=reindex))
[docs] def getSeries(self,reindex=True): """Return a dictionary of the data-columns in the Series format of pandas :param: drop duplicate times (setting it to False might break certain Pandas-operations)""" try: import pandas except ImportError: warning("No pandas-library installed. Returning None") return None data={} if reindex: realindex=numpy.unique(self.data[self.time]) for n in self.names(): if n!=self.time: data[n]=pandas.Series(self.data[n], index=self.data[self.time], name=n) if reindex: if len(data[n])!=len(realindex): try: data[n].axes[0].is_unique=True except: # Newer Pandas versions don't allow setting this. Just drop duplicates data[n]=data[n].drop_duplicates() data[n]=data[n].reindex_axis(realindex) return data
# Should work with Python3 and Python2