---
title: Process
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/02_runners.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
%load_ext autoreload
%autoreload 2
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class ProcessMemory[source]

ProcessMemory(queries:list[Callable], steps:list[Callable], filter:Callable, progress_bar:bool=True)

{% endraw %} {% raw %}
{% endraw %}
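
The callables plug together in a simple way: filter receives a list of the records found by the queries and returns the ones to keep, and each step receives (content, metadata), where the first step gets the raw response bytes and later steps get the previous step's return value (an error raised in a step is logged and that record is skipped). Here is a minimal sketch of that contract, with illustrative names that aren't part of the library:

{% raw %}
# Minimal sketch of the callable contracts, inferred from the demo below
# (keep_ok_html and to_length are illustrative names, not part of webrefine)
def keep_ok_html(records):
    # filter: gets the query records, returns the subset to process
    return [r for r in records if r.mime == 'text/html' and r.status == 200]

def to_length(content, metadata):
    # first step: content is the raw response bytes, metadata the capture record
    return {'url': metadata.url, 'bytes': len(content)}

# process = ProcessMemory(queries=[some_query], filter=keep_ok_html, steps=[to_length])
# results = list(process.run())
{% endraw %}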

Simple test process

In practice we'd use something like parsel, beautifulsoup or selectolax. For a simple demo, though, the standard library's HTMLParser needs no external dependencies.
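
For comparison, a rough sketch of the same extraction with BeautifulSoup (a hypothetical alternative to the HTMLParser demo below; it would require the external beautifulsoup4 package):

{% raw %}
# Sketch: same fields, extracted with BeautifulSoup instead of HTMLParser
# (assumes the same CSS classes as the demo parser below)
from bs4 import BeautifulSoup

def skeptric_extract_bs4(content, metadata):
    soup = BeautifulSoup(content, 'html.parser')
    title = soup.find(class_='post-full-title')
    date = soup.find(class_='byline-meta-date')
    return {
        'title': title.get_text(strip=True) if title else None,
        'date': date.get_text(strip=True) if date else None,
        'url': metadata.url,
        'timestamp': metadata.timestamp,
    }
{% endraw %}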

{% raw %}
from html.parser import HTMLParser

# Pull the post title and date out of a Skeptric page by watching for
# elements with class 'post-full-title' and 'byline-meta-date'
class SkeptricHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.extract = {}
        self.field = None
        
    def handle_starttag(self, tag, attrs):
        if dict(attrs).get('class') == 'post-full-title':
            self.field = 'title'
        if dict(attrs).get('class') == 'byline-meta-date':
            self.field = 'date'

    def handle_endtag(self, tag):
        self.field = None

    def handle_data(self, data):
        if self.field is not None:
            self.extract[self.field] = data

# Filter: keep only successful HTML captures
def skeptric_filter(records):
    return [r for r in records if r.mime == 'text/html' and r.status == 200]

# Step 1: parse the HTML and attach the capture metadata
def skeptric_extract(content, metadata):
    parser = SkeptricHTMLParser()
    html = content.decode('utf-8')
    parser.feed(html)
    data = parser.extract
    data['url'] = metadata.url
    data['timestamp'] = metadata.timestamp
    return data

# Step 2: fail loudly on records missing the fields we need
def skeptric_verify_extract(content, metadata):
    if not content.get('title'):
        raise ValueError('Missing title')
    if not content.get('date'):
        raise ValueError('Missing date')
    return content

# Step 3: parse the date string into a datetime
from datetime import datetime
def skeptric_normalise(content, metadata):
    content = content.copy()
    content['date'] = datetime.strptime(content['date'], '%d %B %Y')
    return content

from webrefine.query import WarcFileQuery
test_data = '../resources/test/skeptric.warc.gz'

skeptric_query = WarcFileQuery(test_data)
{% endraw %} {% raw %}
skeptric_process = ProcessMemory(queries=[skeptric_query],
                                 filter=skeptric_filter,
                                 steps=[skeptric_extract, skeptric_verify_extract, skeptric_normalise])
{% endraw %} {% raw %}
list(skeptric_process.run())
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2') at step skeptric_verify_extract: Missing title
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/tags/data/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 38), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=130269, digest='R7CLAACFU5L7T5LKI5G53RZSMCNUNV6F') at step skeptric_verify_extract: Missing title
[{'title': "Pagination in Internet Archive's Wayback Machine with CDX",
  'date': datetime.datetime(2021, 11, 23, 0, 0),
  'url': 'https://skeptric.com/pagination-wayback-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 34)},
 {'title': 'About Skeptric',
  'date': datetime.datetime(2021, 10, 18, 0, 0),
  'url': 'https://skeptric.com/about/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 37)},
 {'title': 'Searching 100 Billion Webpages Pages With Capture Index',
  'date': datetime.datetime(2020, 6, 11, 0, 0),
  'url': 'https://skeptric.com/searching-100b-pages-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Fast Web Dataset Extraction Worfklow',
  'date': datetime.datetime(2021, 11, 21, 0, 0),
  'url': 'https://skeptric.com/fast-web-data-workflow/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Unique Key for Web Captures',
  'date': datetime.datetime(2021, 11, 19, 0, 0),
  'url': 'https://skeptric.com/key-web-captures/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 40)},
 {'title': 'Hugo Readdir Error with Emacs',
  'date': datetime.datetime(2021, 11, 22, 0, 0),
  'url': 'https://skeptric.com/emacs-tempfile-hugo/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 40)}]
{% endraw %} {% raw %}
skeptric_process.progress_bar = False
{% endraw %} {% raw %}
list(skeptric_process.run())
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2') at step skeptric_verify_extract: Missing title
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/tags/data/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 38), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=130269, digest='R7CLAACFU5L7T5LKI5G53RZSMCNUNV6F') at step skeptric_verify_extract: Missing title
[{'title': "Pagination in Internet Archive's Wayback Machine with CDX",
  'date': datetime.datetime(2021, 11, 23, 0, 0),
  'url': 'https://skeptric.com/pagination-wayback-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 34)},
 {'title': 'About Skeptric',
  'date': datetime.datetime(2021, 10, 18, 0, 0),
  'url': 'https://skeptric.com/about/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 37)},
 {'title': 'Searching 100 Billion Webpages Pages With Capture Index',
  'date': datetime.datetime(2020, 6, 11, 0, 0),
  'url': 'https://skeptric.com/searching-100b-pages-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Fast Web Dataset Extraction Worfklow',
  'date': datetime.datetime(2021, 11, 21, 0, 0),
  'url': 'https://skeptric.com/fast-web-data-workflow/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Unique Key for Web Captures',
  'date': datetime.datetime(2021, 11, 19, 0, 0),
  'url': 'https://skeptric.com/key-web-captures/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 40)},
 {'title': 'Hugo Readdir Error with Emacs',
  'date': datetime.datetime(2021, 11, 22, 0, 0),
  'url': 'https://skeptric.com/emacs-tempfile-hugo/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 40)}]
{% endraw %}

We can always look up an error by copying the record out of the log message:

It would be nicer if everything were a string so we didn't have to handle the imports...

{% raw %}
from webrefine.query import WarcFileRecord
import datetime
from pathlib import PosixPath
WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2')
WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2')
{% endraw %}
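
With the record in hand we can go back to the WARC file and inspect the raw capture. A hedged sketch using warcio directly (an assumption here; webrefine may provide its own accessor), seeking to the record's offset:

{% raw %}
# Sketch: read the raw HTML for a failing record straight from the WARC file,
# assuming the .warc.gz stores each record as its own gzip member (the usual case)
from warcio.archiveiterator import ArchiveIterator

def read_record_content(record):
    with open(record.path, 'rb') as stream:
        stream.seek(record.offset)
        warc_record = next(iter(ArchiveIterator(stream)))
        return warc_record.content_stream().read()

# html = read_record_content(WarcFileRecord(url='https://skeptric.com/', ...))
{% endraw %}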