---
title: WARC - Querying WARC Records
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/02_warc.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
from datetime import datetime
{% endraw %} {% raw %}
datetime.strptime('2021-43-0', '%Y-%W-%w')
datetime.datetime(2021, 10, 31, 0, 0)
{% endraw %} {% raw %}

get_warc_metadata[source]

get_warc_metadata(record:ArcWarcRecord)

{% endraw %} {% raw %}

class QueryWARC[source]

QueryWARC(path:Union[str, Path])

{% endraw %} {% raw %}
{% endraw %}

Testing

The test data was generated with the following command:

wget -r -Q1M --domains skeptric.com --warc-file=skeptric --delete-after --no-directories https://skeptric.com/pagination-wayback-cdx/

See the warcio library for how to do this in Python.

{% raw %}
test_data = '../resources/test/skeptric.warc.gz'
{% endraw %} {% raw %}
warc = QueryWARC(test_data)
results = list(warc.query())
results
[{'url': 'https://skeptric.com/pagination-wayback-cdx/',
  'timestamp': '2021-11-26T11:28:34Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '889',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/robots.txt',
  'timestamp': '2021-11-26T11:28:34Z',
  'mime': 'text/html',
  'status': '404',
  'offset': '5804',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css',
  'timestamp': '2021-11-26T11:28:35Z',
  'mime': 'text/css',
  'status': '200',
  'offset': '7197',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/',
  'timestamp': '2021-11-26T11:28:36Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '17122',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/about/',
  'timestamp': '2021-11-26T11:28:37Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '125261',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/tags/data',
  'timestamp': '2021-11-26T11:28:37Z',
  'mime': 'text/html',
  'status': '302',
  'offset': '129093',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/tags/data/',
  'timestamp': '2021-11-26T11:28:38Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '130269',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/images/wayback_empty_returns.png',
  'timestamp': '2021-11-26T11:28:38Z',
  'mime': 'image/png',
  'status': '200',
  'offset': '160971',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/searching-100b-pages-cdx',
  'timestamp': '2021-11-26T11:28:39Z',
  'mime': 'text/html',
  'status': '302',
  'offset': '173368',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/searching-100b-pages-cdx/',
  'timestamp': '2021-11-26T11:28:39Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '174558',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/fast-web-data-workflow/',
  'timestamp': '2021-11-26T11:28:39Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '188608',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/key-web-captures/',
  'timestamp': '2021-11-26T11:28:40Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '195651',
  'filename': '../resources/test/skeptric.warc.gz'},
 {'url': 'https://skeptric.com/emacs-tempfile-hugo/',
  'timestamp': '2021-11-26T11:28:40Z',
  'mime': 'text/html',
  'status': '200',
  'offset': '201243',
  'filename': '../resources/test/skeptric.warc.gz'}]
{% endraw %}

Try fetching a record

{% raw %}
image_record = [r for r in results if r['mime'] == 'image/png'][0]

content = warc.fetch_one(image_record)
from IPython.display import Image
Image(content)
{% endraw %}

Fetching all records

{% raw %}
contents = warc.fetch(results)
len(contents)
13
{% endraw %}