1"""Parse CSV alike data from different sources including:
3- markdown tables (e.g. from documentation)
6Future:
7- csv
8- excel
10"""
import asyncio
import inspect
import sys
import time
import re
import numpy as np
import pandas as pd
from datetime import timedelta
from datetime import datetime

from io import StringIO
# import uuid
import random
import yaml

import parse  # https://github.com/r1chardj0n3s/parse

from gutools.tools import get_calling_function

# matches "<expr>" cells whose content must be eval()'ed
_regx_eval = re.compile(r"(\<(.*)\>)")

# matches a column spec: an optional '*' index marker around either a bare
# name ("seq") or a parse() format ("{seq:d}")
_regx_param = re.compile(r"""
(?P<idx>[\*\s]+)?
(?P<format>
\{?
 (?P<param>[_a-z][^:\*\}]*)
 (:(?P<fmt>[^\*\}]*))?
\}?
)
(?P<idx2>[\*\s]+)?
""", re.VERBOSE | re.IGNORECASE)

# quick smoke tests for _regx_param; guarded so they don't print on import
if __name__ == '__main__':
    m = _regx_param.match('*{seq:d}*')
    print(m.groupdict())
    m = _regx_param.match('{seq:d}')
    print(m.groupdict())
    m = _regx_param.match('*seq*')
    print(m.groupdict())
    m = _regx_param.match('seq')
    print(m.groupdict())
    m = _regx_param.match('{bid1}')
    print(m.groupdict())
    m = _regx_param.match('{_foo}')
    print(m.groupdict())
    m = _regx_param.match('{0}')
    assert not m  # params must start with a letter or underscore

default_converters = {'seq': int, 'p0': float, 'p1': float, 'date': datetime.fromisoformat,
                      'type': float, 'amount': float, 'price': float,
                      'lid': int, 'exp': float, 'fill': float,
                      'uf': int, 'cf': int, 'cd': int, 'bid': int, }

# --------------------------------------------------------
# Markdown tables extractor
# --------------------------------------------------------

def apply_converters(row, conv, nan_error=False):
    for i, c in conv:
        try:
            row[i] = c(row[i])
        except Exception:
            if nan_error:
                row[i] = np.nan
    # always convert 'NaN' strings to np.nan
    for i, x in enumerate(row):
        if x in ('NaN', ):
            row[i] = np.nan
    return row

def parse_fields(row, header, env, nan_error=False):
    """Parse each cell against its header format, updating *env* with any
    captured value, then expand <expr> cells by eval()'ing them in *env*."""
    # step 1: parse all cells with a known format and update the context
    for i, fmt in enumerate(header):
        try:
            d = _regx_param.match(fmt).groupdict()
            info = parse.parse(d['format'], row[i])
            if info:
                env.update(info.named)
                # we assume there's only one named param or, at least,
                # that the result is the last one
                for key, value in info.named.items():
                    row[i] = value
            elif d['idx']:
                return None  # it is an index and we cannot parse the value
        except Exception:
            if nan_error:
                row[i] = np.nan

    # step 2: expand <expr> cells using the captured context
    for i, t_value in enumerate(row):
        if isinstance(t_value, str):
            m = _regx_eval.match(t_value)
            if m:
                exp = m.group(2)
                try:
                    row[i] = eval(exp, env)
                except Exception:
                    pass  # leave the raw cell when the expression fails
    return row

def prepare_converters(columns, converters):
    """Return (position, converter) pairs for the columns that have one."""
    conv = list()
    for i, field in enumerate(columns):
        c = converters.get(field)
        if c:
            conv.append((i, c))
    return conv
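
# A minimal sketch of the two helpers above: the column names and the sample
# row are hypothetical, and 'note' has no converter so it is left untouched.
if __name__ == '__main__':
    _conv = prepare_converters(['seq', 'price', 'note'], default_converters)
    print(apply_converters(['3', '10.5', 'NaN'], _conv))
    # -> [3, 10.5, nan]   ('NaN' strings always become np.nan)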

def parse_df_header(header):
    """Split a MD header row into (idx0, idx1, header0, header1): idx0 is the
    raw index column spec (marked with '*'), idx1 its bare name, header0 all
    bare column names and header1 the columns without the index."""
    if isinstance(header, str):
        header = [scalar.strip() for scalar in header.strip().strip('|').split('|')]

    idx0 = idx1 = None
    header0 = list(header)

    for i, col in enumerate(header0):
        m = _regx_param.match(col)
        if m:
            d = m.groupdict()
            header0[i] = d['param']
            if d['idx']:
                idx0, idx1 = col, d['param']
        else:
            header0[i] = col

    header1 = list(header0)
    if idx1:
        header1.remove(idx1)

    return idx0, idx1, header0, header1
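
# A minimal sketch, using a hypothetical header: '*{seq:d}*' marks the index
# column, so it is kept in header0 and dropped from header1.
if __name__ == '__main__':
    print(parse_df_header('| *{seq:d}* | price | amount |'))
    # -> ('*{seq:d}*', 'seq', ['seq', 'price', 'amount'], ['price', 'amount'])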

def set_df_index(df, header):
    idx0, idx1, _, _ = parse_df_header(header)
    if idx1:
        df.rename({idx0: idx1}, axis='columns', inplace=True)
        df.set_index(idx1, inplace=True)

def Markdown_extractor(content, converters=default_converters, env=None, nan_error=False, what=['table']):
    """Extract elements from MD formatted content, yielding one DataFrame
    per table found."""
    header_fmt = header0 = header1 = None
    env = env if env is not None else dict()

    stream = StringIO(content)

    while not stream.closed:
        rows = table_reader(stream)
        data = list()
        for header_fmt in rows:
            idx0, idx1, header0, header1 = parse_df_header(header_fmt)
            break
        else:
            break  # no header found, stop reading

        for row in rows:
            row = parse_fields(row, header_fmt, env)
            row and data.append(row)

        df = pd.DataFrame(data, columns=header0)
        if idx1:
            df.set_index(idx1, inplace=True)
        yield df
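
# A minimal sketch with a hypothetical two-column table: '*{seq:d}*' becomes
# the integer index of the extracted DataFrame.
if __name__ == '__main__':
    _md = """
| *{seq:d}* | price |
|-----------|-------|
| 1         | 10.5  |
| 2         | 11.0  |
"""
    for _df in Markdown_extractor(_md):
        print(_df)  # a 2-row DataFrame indexed by seq; price stays a string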

def table_reader(stream):
    """Parse a table in MD format.
    Yields the header and then every row, then exits.
    The stream is moved forward to the next MD content.
    """
    header = None
    last = None
    for line in stream:  # iterate lazily so the stream position is preserved
        row = [scalar.strip() for scalar in line.strip().strip('|').split('|')]

        # check whether this line is the '---' separator under the header
        c = ''.join(row)
        if not header:
            if c:
                if not c.strip('-').strip(':'):
                    header = last  # the previous line was the header
                    yield header
        else:
            if line.strip():
                yield row
            else:
                break  # blank line: end of table
        last = row

def df_asdict(df):
    """Yield each df row as a dict, including the index under its name."""
    name = df.index.name
    for idx, serie in df.iterrows():
        d = dict(zip(serie.index, serie.values))
        d[name] = idx
        yield d
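
# A minimal sketch with a hypothetical one-column frame: the named index is
# folded back into each emitted dict.
if __name__ == '__main__':
    _df = pd.DataFrame({'price': [10.5, 11.0]}, index=pd.Index([1, 2], name='seq'))
    print(list(df_asdict(_df)))
    # -> [{'price': 10.5, 'seq': 1}, {'price': 11.0, 'seq': 2}]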

def list_asdict(lines, header):
    for row in lines:
        d = dict(zip(header, row))
        yield d


def twin_iter(a, b):
    """Iterate both dicts in the same (sorted) key order."""
    for k in sorted(a.keys()):
        yield k, a[k], b[k]

class Match(object):
    """Compare a DataFrame against an expected MD table, capturing template
    variables on a first pass and eval()'ing <expr> cells on a second one."""

    def __init__(self, df, content, env):
        self.df = df
        # self.df.reset_index()  # to add index into context values
        self.content = content
        self.env = env

    def match(self):
        for step in ['capture', 'eval']:
            stream = StringIO(self.content)
            lines = table_reader(stream)

            for header in lines:
                idx0, idx1, header0, header1 = parse_df_header(header)
                break
            env = self.env

            # iterate df rows and template rows one by one
            for i, (e_row, t_row) in enumerate(
                    zip(df_asdict(self.df), list_asdict(lines, header0))):

                a, b = list(e_row.keys()), list(t_row.keys())
                a.sort()
                b.sort()
                assert a == b

                # pass 1: cast to the same type and capture variables
                env.update(e_row)

                pending = list()
                for key, e_value, t_value in twin_iter(e_row, t_row):
                    # - defer <expr> cells until all variables are captured
                    m = _regx_eval.match(t_value)
                    if m:
                        pending.append((key, m.group(2)))
                        continue

                    # - try to capture a variable from e_row
                    m = _regx_param.match(t_value)
                    if m:
                        d = m.groupdict()
                        env[d['param']] = t_row[key] = e_row[key]
                        continue

                    # - try to cast the value to the same class; NaN is
                    #   converted as well
                    try:
                        t_row[key] = e_value.__class__(t_value)
                        continue
                    except Exception:
                        pass  # the cell stays a string when the cast fails

                if step in ('eval', ):
                    # pass 2: expand <expr> cells
                    for key, exp in pending:
                        try:
                            t_row[key] = eval(exp, env)
                        except Exception:
                            pass  # leave the raw expression in place

                    # compare only once every <expr> has been expanded
                    if t_row != e_row:
                        diff = list()
                        diff.append(f"{'key':9} {'Expected':>8} --- {'Observed':<8}")
                        for key, e_value, t_value in twin_iter(e_row, t_row):
                            if e_value != t_value:
                                if isinstance(e_value, float) and not np.isnan(e_value):
                                    diff.append(f"- {key:6}: {e_value:>8} != {t_value:<8}")

                        diff = '\n'.join(diff)
                        error = f"""*** ERROR ***
{self.df}

row: {i}
{diff}
"""
                        print(error)
                        return False
        return True
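
# A minimal sketch: matching a frame against the very table it was extracted
# from should succeed (the table is hypothetical).
if __name__ == '__main__':
    _md = """
| *{seq:d}* | price |
|-----------|-------|
| 1         | 10.5  |
| 2         | 11.0  |
"""
    for _df in Markdown_extractor(_md):
        assert Match(_df, _md, env={}).match()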

def iter_df(df, converters=default_converters, nan_error=False):
    """Yield df rows as lists (index first), applying converters."""
    # prepare converters, including one for the index column when it is named
    if df.index.name:
        fields = [df.index.name] + list(df.columns)
    else:
        fields = list(df.columns)
    conv = prepare_converters(fields, converters)

    for idx, row in df.iterrows():
        row = [idx] + list(row)
        row = apply_converters(row, conv, nan_error)
        yield row
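
# A minimal sketch with hypothetical string data: default_converters casts
# the 'seq' index to int and 'price' to float.
if __name__ == '__main__':
    _df = pd.DataFrame({'price': ['10.5', '11.0']},
                       index=pd.Index(['1', '2'], name='seq'))
    print(list(iter_df(_df)))
    # -> [[1, 10.5], [2, 11.0]]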

# --------------------------------------------------------
# Check internal table structures in the middle of the algorithm
# --------------------------------------------------------

async def inject_events(df, hub, key, klass, converters=default_converters, rate=10,
                        pub_keys=['{key}', '/test{key}', ]):
    """Inject events from a df into the Hub using a key,
    converting row values prior to building a klass instance.
    """
    ctx = locals()
    pub_keys = [k.format(**ctx) for k in pub_keys]

    s = 1 / rate

    for row in iter_df(df, converters):
        record = klass(*row)
        # print(record)

        for i, k in enumerate(pub_keys):
            # the real key goes first with high priority; test keys follow
            priority = 1 if i == 0 else 15
            await asyncio.sleep(s)

            join = asyncio.Event()
            hub.publish(k, record, priority=priority, join=join)
            await join.wait()
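
# Sketch only (not runnable as-is): 'hub' is the application's pub/sub hub and
# 'Record' a hypothetical class whose positional fields match the df columns,
# index first.
#
#   df = next(Markdown_extractor(events_md))
#   await inject_events(df, hub, '/orders', Record, rate=100)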

async def inject_events_using_date(df, hub, key, klass, date='date', converters=default_converters, speed=1):
    """Inject events from a df into the Hub using a key,
    converting row values prior to building a klass instance.
    Events are spaced in time by their date column, scaled down by speed.
    """
    now = None
    for row in iter_df(df, converters):
        record = klass(*row)
        when = getattr(record, date)

        if now:
            dt = when - now
            delay = dt.total_seconds() / speed  # .seconds would drop the days part
            await asyncio.sleep(delay)

        now = when
        hub.publish(key, record)

async def await_until(condition, timeout=10, sampling=10, extra=0):
    """Poll *condition* (a python expression evaluated against the caller's
    locals) until it is true or *timeout* seconds have passed."""
    # func = get_calling_function()
    frame = sys._getframe(1)
    context = dict(frame.f_locals)  # snapshot: later rebindings are not seen

    t0 = time.time()
    s = 1 / sampling
    while time.time() - t0 < timeout:
        try:
            r = eval(condition, context)
            if r:
                break
        except Exception:
            pass
        await asyncio.sleep(s)
    else:
        raise TimeoutError("await_until() failed")

    await asyncio.sleep(extra)
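
# A minimal, self-contained sketch: a background task flips a flag that the
# polled expression observes through the caller's locals.
if __name__ == '__main__':
    async def _demo():
        state = {'done': False}

        async def _later():
            await asyncio.sleep(0.1)
            state['done'] = True

        task = asyncio.ensure_future(_later())
        await await_until("state['done']", timeout=2)
        await task
        print('await_until: ok')

    asyncio.run(_demo())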

def parse_df_states(states, converters=default_converters):
    """Parse a dict of internal states in Markdown to check the internal
    status during the algorithm's evolution.
    """
    df_states = dict()
    for seq, state in states.items():
        for df in Markdown_extractor(state, converters):
            df_states[seq] = df  # one table per state is expected
    return df_states

class InternalStatusMonitor(object):
    def __init__(self, hub, key, expected_status, obj, df_attr='df', seq_attr='seq', env=None):
        self.hub = hub
        self.key = f'/test{key}'
        self.expected_status = expected_status
        self.obj = obj  # where the df to be checked lives
        self.df_attr = df_attr
        self.seq_attr = seq_attr

        self.env = env or dict()
        self.result = None

        self.hub.subscribe(self.key, self.check_df_status)

    async def check_df_status(self, key, data):
        if self.result:
            return  # don't process any check once an error has happened

        value = getattr(data, self.seq_attr)
        expected = self.expected_status.get(value)
        if expected is not None:
            observed = getattr(self.obj, self.df_attr)

            m = Match(observed, expected, self.env)
            r = m.match()
            if not r:
                self.result = RuntimeError(f"*** ERROR: internal status differs in {self.seq_attr}: {value}, data: {data}")
                print(observed)
                print(expected)
                raise self.result
            else:
                print(f"OK: internal status {value} matches")

async def execute_supervision_test(states, events, record_klass, key, instance, converters=default_converters, env=None):
    app = instance.app
    hub = app.hub
    await app.start()

    # df_states = parse_df_states(states, converters)

    t0 = time.time()
    supervisor = InternalStatusMonitor(hub, key, states, instance, env=env)

    for df in Markdown_extractor(events, env=env):
        # remove rows with no sequence (NaN)
        # df = df[df.index > '']
        # await inject_events(df, hub, key, klass, converters, rate=4)
        # await inject_events_using_date(df, hub, key, klass, 'date', converters, speed=10)
        await inject_events(df, hub, key, record_klass, rate=1000)

    # await await_until('hub._queue.empty()', extra=0.25)
    elapsed = time.time() - t0
    print(f"Elapsed: {elapsed}")
    if supervisor.result:
        raise supervisor.result

    await app.stop()
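
# Sketch only (not runnable as-is): 'strategy' and 'Record' are hypothetical;
# 'states' maps a sequence number to the expected internal table (as MD) right
# after that event, and 'events' is an MD table of the input records.
#
#   await execute_supervision_test(
#       states={3: expected_md_after_seq_3},
#       events=events_md, record_klass=Record,
#       key='/orders', instance=strategy)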

class InjectorTest(object):
    def __init__(self, app, events, expected, env=None):
        self.app = app
        self.events = events
        self.expected = expected
        self.env = env

        # runtime
        self.result = None
        self.test_key = None
        self.checked = None

    async def run(self, timers=True):
        """
        timers = False : cancel the hub's own timers, so only injected
        events drive the test (the timers remain defined in events)
        """
        app = self.app
        hub = app.hub

        self.test_key = f'/test/run/{random.randint(0, 10**6)}'
        hub.subscribe(f'{self.test_key}/.*', self._check_status)

        await app.start()

        if not timers:  # avoid installing any timers but mine
            hub._task_timers.cancel()

        pub_keys = ['{key}', f'{self.test_key}/{{stage}}{{key}}']

        t0 = time.time()
        for events in Markdown_extractor(self.events, env=self.env):
            await self.inject_events(events, pub_keys, rate=1000)

        elapsed = time.time() - t0
        print(f"Elapsed: {elapsed}")
        if self.result:
            raise self.result

        await app.stop()

    async def inject_events(self, events, pub_keys, rate=10):
        hub = self.app.hub
        ctx = locals()
        s = 1 / rate
        env = self.env

        # NOTE: the explicit loop argument was removed in python 3.10
        self.checked = asyncio.Event()
        join = asyncio.Event()

        for seq, stage, key, message, date in iter_df(events):
            await asyncio.sleep(s)
            try:
                m = _regx_eval.match(message)
                if m:
                    message = eval(m.group(2), env)
                else:
                    message = yaml.safe_load(message)
            except Exception:
                pass  # publish the raw message when it cannot be parsed

            for i, k in enumerate(pub_keys):
                k = k.format(**locals())
                self.checked.clear()
                join.clear()
                hub.publish(k, message, join=join)
                await join.wait()  # wait until the event has been processed

            await self.checked.wait()  # wait until the check has been done

    async def _check_status(self, key, data):
        try:
            if self.result:
                return  # don't process any check once an error has happened

            env = self.env
            stage = key.split(self.test_key)[1].split('/')[1]

            expected = self.expected.get(stage) or {}
            for exp, status in expected.items():
                try:
                    observed = eval(exp, env)
                except Exception:
                    continue  # the expression cannot be evaluated yet

                m = Match(observed, status, env)
                r = m.match()

                if not r:
                    self.result = RuntimeError(f"*** ERROR: internal status differs in stage '{stage}', data: {data}")
                    print(observed)
                    print(status)
                    raise self.result
                else:
                    print(f"Internal status '{stage}' Ok")
        finally:
            self.checked.set()

    # ------------------------------------------------------
    # helpers
    # ------------------------------------------------------
    def _no_timers_subscribe(self, pattern, callback, duplicate=False, single=False):
        # NOTE: assumes parse_uri is importable (e.g. from gutools.tools) and
        # that hub.subscribe has been saved as self.org_subscribe beforehand
        uri_ = parse_uri(pattern)
        if uri_['scheme'] in ('timer', ):
            return
        self.org_subscribe(pattern, callback, duplicate, single)
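
# Sketch only (not runnable as-is): 'app' is the application under test and
# the MD payloads are hypothetical. 'expected' maps a stage name to
# {expression: expected_md_table} pairs evaluated against env.
#
#   test = InjectorTest(app, events_md,
#                       expected={'filled': {'strategy.df': expected_md}},
#                       env={'strategy': strategy})
#   await test.run(timers=False)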

# -----------------------------------------------------
# timeit
# -----------------------------------------------------
import timeit
import os


def speed_meter(N=None, label=None, **test):
    """Run timeit.repeat(**test), average the best third of the runs and log
    a CSV line; when N is given, speed is reported as N / elapsed."""
    label = label or '{stmt}'.format(**test)
    elapsed = timeit.repeat(**test)
    elapsed.sort()
    n = max(1, test.get('repeat', 5) // 3)
    elapsed = sum(elapsed[:n]) / n
    if N:
        speed = N / elapsed
    else:
        speed = None

    test['label'] = label
    test['speed'] = speed
    test['elapsed'] = elapsed
    test['now'] = now = datetime.now()
    test['now_txt'] = now_txt = now.strftime('%Y-%m-%dT%H:%M:%S')

    # running under the WingIDE debugger?
    _debug_ = set(sys.executable.split(os.path.sep)).\
        intersection(set(['wingdb']))
    _debug_ = len(_debug_) > 0

    username = os.getenv('USERNAME')
    speed_txt = f'{speed:e}' if speed is not None else ''
    with open(f'/tmp/{username}-speed_meter.csv', mode='a') as f:
        line = f'{now_txt:}, {N}, {elapsed:1.5f}, {speed_txt}, {label}, {_debug_}\n'
        f.write(line)
    return test
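
# A minimal sketch: time a hypothetical statement; N=10**4 items per run makes
# 'speed' come out in items/second.
if __name__ == '__main__':
    _r = speed_meter(N=10**4, stmt='[i * i for i in range(10**4)]',
                     repeat=5, number=10)
    print(f"{_r['label']}: {_r['elapsed']:.5f}s best-third average")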