---
title: Process
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/02_runners.ipynb"
---
%load_ext autoreload
%autoreload 2
In practice we'd use something like parsel, BeautifulSoup or selectolax. However, for a simple demo we use the standard library's `html.parser`, which has no external dependencies.
from html.parser import HTMLParser
class SkeptricHTMLParser(HTMLParser):
    """Collect the post title and byline date from an HTML page.

    Feed the page with ``feed``; afterwards ``extract`` is a dict that holds
    a ``'title'`` and/or ``'date'`` entry for every field whose marker
    element was seen with some text inside it.

    A field is captured from the text that follows a start tag whose
    ``class`` attribute is exactly ``post-full-title`` (title) or
    ``byline-meta-date`` (date), up to the next end tag of any element.
    If a marker appears more than once, the last occurrence wins.
    """

    # Exact value of the ``class`` attribute -> key written into ``extract``.
    _FIELD_BY_CLASS = {
        'post-full-title': 'title',
        'byline-meta-date': 'date',
    }

    def __init__(self):
        super().__init__()
        self.extract = {}
        # Name of the field currently being captured, or None.
        self.field = None
        # Text chunks received so far for the current capture.
        self._chunks = []

    def handle_starttag(self, tag, attrs):
        """Start capturing when the tag carries one of the marker classes."""
        field = self._FIELD_BY_CLASS.get(dict(attrs).get('class'))
        if field is not None:
            self.field = field
            # A new marker starts a fresh capture, so an earlier occurrence
            # of the same field is replaced rather than appended to.
            self._chunks = []

    def handle_endtag(self, tag):
        """Stop capturing at the first end tag, whichever element it closes."""
        self.field = None

    def handle_data(self, data):
        """Append a text chunk to the field being captured, if any."""
        if self.field is not None:
            # HTMLParser may deliver one run of text in several calls (for
            # example around a stray "<", or around a nested inline tag), so
            # the chunks are joined instead of keeping only the last one.
            self._chunks.append(data)
            self.extract[self.field] = ''.join(self._chunks)
def skeptric_filter(records):
    """Return, as a list, the records that are HTML pages fetched with status 200.

    A record is kept when its ``mime`` attribute equals ``'text/html'`` and
    its ``status`` attribute equals ``200``; input order is preserved.
    """
    def is_successful_html(record):
        # mime is checked first, so status is only read for HTML records.
        return record.mime == 'text/html' and record.status == 200

    return list(filter(is_successful_html, records))
def skeptric_extract(content, metadata):
    """Parse a fetched page and return the extracted fields as a dict.

    ``content`` is decoded as UTF-8 and fed to a fresh ``SkeptricHTMLParser``.
    The parser's ``extract`` dict is returned with two extra entries,
    ``'url'`` and ``'timestamp'``, copied from ``metadata``.
    """
    page_parser = SkeptricHTMLParser()
    page_parser.feed(content.decode('utf-8'))
    record = page_parser.extract
    # Attach where and when the page was captured.
    record.update(url=metadata.url, timestamp=metadata.timestamp)
    return record
def skeptric_verify_extract(content, metadata):
    """Check that the extracted dict has a non-empty title and date.

    Returns ``content`` itself, unchanged, when both are present.
    Raises ``ValueError`` for the first one that is absent or falsy;
    the title is checked before the date. ``metadata`` is not used.
    """
    required = (
        ('title', 'Missing title'),
        ('date', 'Missing date'),
    )
    for key, message in required:
        if not content.get(key):
            raise ValueError(message)
    return content
from datetime import datetime
def skeptric_normalise(content, metadata):
    """Return a copy of ``content`` with ``'date'`` parsed into a datetime.

    The date string must look like ``'26 November 2021'`` (format
    ``'%d %B %Y'``). The input dict is not modified; ``metadata`` is not used.

    Raises ``KeyError`` if ``content`` has no ``'date'`` entry and
    ``ValueError`` if the string does not match the format.
    """
    # Bind the class locally. Further down this file runs `import datetime`,
    # which rebinds the global name `datetime` to the module; after that a
    # global `datetime.strptime` lookup would fail on every later call.
    from datetime import datetime as _datetime

    content = content.copy()
    content['date'] = _datetime.strptime(content['date'], '%d %B %Y')
    return content
from webrefine.query import WarcFileQuery
# Sample archive that the demo pipeline reads from.
test_data = '../resources/test/skeptric.warc.gz'
# Query over that single file.
skeptric_query = WarcFileQuery(test_data)
# Wire together the query, the record filter and the processing steps.
# The steps are listed in the order extract -> verify -> normalise.
# NOTE(review): ProcessMemory is not imported in the visible part of this
# file - confirm where it is defined.
skeptric_process = ProcessMemory(queries=[skeptric_query],
filter=skeptric_filter,
steps=[skeptric_extract, skeptric_verify_extract, skeptric_normalise])
# Run the pipeline and materialise everything it yields.
list(skeptric_process.run())
# Switch the progress bar off and run the same pipeline again.
skeptric_process.progress_bar = False
list(skeptric_process.run())
We can always look up an error
It would be nicer if every field were a string, so that we didn't have to handle the imports...
from webrefine.query import WarcFileRecord
import datetime
from pathlib import PosixPath
# Build the record for https://skeptric.com/ by hand, spelling out every field;
# this is why datetime and PosixPath are imported just above.
# NOTE(review): PosixPath cannot be instantiated on Windows - presumably this
# expression was pasted from a repr; confirm whether a plain Path would do.
WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2')