Benchmark similarity checks

Remember to compile cnnclustering with TRACE_CYTHON=0 if timings should be measured. TRACE_CYTHON=1 is required for line profiling and test coverage and will cause substantial overhead.

Notebook to organise benchmarks of different implementations of the density-criterion (similarity) check. In general, the check tests whether two neighbourhood containers have at least a certain number of elements in common.

Table of Contents

  • 1  Pre-requirements

  • 2  Version info

  • 3  Helper function definitions

    • 3.1  Plots

  • 4  Runs

    • 4.1  CommonNN similarity check

      • 4.1.1  Run a a

      • 4.1.2  Run a b

      • 4.1.3  Run a c

      • 4.1.4  Run b a

      • 4.1.5  Run b b

      • 4.1.6  Run b c

      • 4.1.7  Run c a

      • 4.1.8  Run c b

    • 4.2  Scaling

    • 4.3  Plots

    • 4.4  Run a a

    • 4.5  Run a b

    • 4.6  Run a c

    • 4.7  Run b a

    • 4.8  Compare switch/no switch

    • 4.9  Compare worst case

Pre-requirements

[1]:
from datetime import datetime
import pathlib
import sys

import cnnclustering
from cnnclustering import cluster
from cnnclustering import _fit, _primitive_types, _types
from IPython.core.magics.execution import TimeitResult
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit
from tqdm.notebook import tqdm

import helper
[2]:
import importlib
[3]:
importlib.reload(helper)
[3]:
<module 'helper' from '/home/janjoswig/repo/CommonNNClustering/docsrc/benchmark/helper.py'>
[4]:
# Set the default figure resolution to 300 dpi
mpl.rcParams["figure.dpi"] = 300

Version info

[5]:
# Report the interpreter version, followed by the version of each listed module
print(f"{'Python':>20} :  ", *sys.version.splitlines())

# (alias, module) pairs to report. The list is empty here, so the loop
# below prints nothing; add pairs to report their versions.
modules = [
]

for alias, m in modules:
    try:
        print(f"{alias:>20} :  ", m.__version__)
    except AttributeError:
        # The module does not expose a __version__ attribute
        print(f"{alias:>20} :  ", "no version info")
              Python :   3.8.8 (default, Mar 11 2021, 08:58:19)  [GCC 8.3.0]
[6]:
# Report the version of the benchmarked package
print("cnnclustering :  ", cnnclustering.__version__)
cnnclustering :   0.4.2

Helper function definitions

[7]:
# Random generator with a fixed seed (2021); gen_shuffled draws from it
rng = np.random.default_rng(2021)
[8]:
# Case generation
def gen_range(start, stop, step):
    """Yield the members of ``range(start, stop, step)`` one by one, in order."""
    for member in range(start, stop, step):
        yield member

def gen_shuffled(start, stop, step):
    """Yield the members of ``range(start, stop, step)`` in random order.

    Every member is yielded exactly once. The order is drawn from the
    module-level random generator ``rng``.
    """
    from_range = np.arange(start, stop, step)
    # Draw exactly as many members as the range holds. Deriving the size
    # from ``(stop - start) // step`` dropped one member whenever the span
    # was not a multiple of ``step`` (e.g. ``(0, 5, 2)`` has 3 members but
    # ``5 // 2 == 2``).
    yield from rng.choice(from_range, size=from_range.size, replace=False)
[9]:
# Neighbourhood sizes to benchmark: 200 multiplied by successive powers of 5
member_counts = [200 * 5**exponent for exponent in range(6)]
member_counts
[9]:
[200, 1000, 5000, 25000, 125000, 625000]
[10]:
# Benchmark cases, keyed by case name. Each case is a list with one entry
# per member count. An entry is the tuple
#     (run_id, gen_a, args_a, gen_b, args_b, c)
# where gen_x(*args_x) yields the members assigned to neighbourhood x and
# c is the value handed to the cluster parameters of the similarity check.
CASES_MAP = {}

# Overlap cases: neighbourhood a holds range(0, count, 1), neighbourhood b
# holds every step_b-th member of it.
#   equal / mixed:             members are assigned in ascending order
#   shuffled / mixed_shuffled: members are assigned in random order
# NOTE: "mixed" used to be built with gen_shuffled, which made it an exact
# duplicate of "mixed_shuffled" and fed unordered members to the
# Screensorted runs, which otherwise only use the ordered cases. Timings
# saved for "mixed_*" before this fix were measured on shuffled input.
overlap_cases = (
    # (name prefix, generator, step of b)
    ("equal", gen_range, 1),
    ("mixed", gen_range, 2),
    ("shuffled", gen_shuffled, 1),
    ("mixed_shuffled", gen_shuffled, 2),
)

# "c50" / "c33": c is 1/2 resp. 1/3 of the number of members in b
for label, divisor in (("c50", 2), ("c33", 3)):
    for prefix, gen, step_b in overlap_cases:
        CASES_MAP[f"{prefix}_{label}"] = [
            (
                str(count),
                gen, (0, count, 1),
                gen, (0, count, step_b),
                count // (divisor * step_b),
            )
            for count in member_counts
        ]

# Identical neighbourhoods with c = 0
CASES_MAP["no_check"] = [
    (str(count), gen_range, (0, count, 1), gen_range, (0, count, 1), 0)
    for count in member_counts
]

# Disjoint neighbourhoods with c = 1
CASES_MAP["diff"] = [
    (str(count), gen_range, (0, count, 1), gen_range, (count, 2 * count, 1), 1)
    for count in member_counts
]

# Fixed neighbourhood size of 625000; here the member count is used as c
CASES_MAP["equal_p625000"] = [
    (str(count), gen_range, (0, 625000, 1), gen_range, (0, 625000, 1), count)
    for count in member_counts
]
[11]:
CASES_MAP
[11]:
{'equal_c50': [('200',
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   100),
  ('1000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   500),
  ('5000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   2500),
  ('25000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   12500),
  ('125000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   62500),
  ('625000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   312500)],
 'mixed_c50': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 2),
   50),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 2),
   250),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 2),
   1250),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 2),
   6250),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 2),
   31250),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 2),
   156250)],
 'shuffled_c50': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   100),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   500),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   2500),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   12500),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   62500),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   312500)],
 'mixed_shuffled_c50': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 2),
   50),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 2),
   250),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 2),
   1250),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 2),
   6250),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 2),
   31250),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 2),
   156250)],
 'equal_c33': [('200',
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   66),
  ('1000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   333),
  ('5000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   1666),
  ('25000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   8333),
  ('125000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   41666),
  ('625000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   208333)],
 'mixed_c33': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 2),
   33),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 2),
   166),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 2),
   833),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 2),
   4166),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 2),
   20833),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 2),
   104166)],
 'shuffled_c33': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   66),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   333),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   1666),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   8333),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   41666),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   208333)],
 'mixed_shuffled_c33': [('200',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 200, 2),
   33),
  ('1000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 1000, 2),
   166),
  ('5000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 5000, 2),
   833),
  ('25000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 25000, 2),
   4166),
  ('125000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 125000, 2),
   20833),
  ('625000',
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_shuffled(start, stop, step)>,
   (0, 625000, 2),
   104166)],
 'no_check': [('200',
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   0),
  ('1000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   0),
  ('5000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   0),
  ('25000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   0),
  ('125000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   0),
  ('625000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   0)],
 'diff': [('200',
   <function __main__.gen_range(start, stop, step)>,
   (0, 200, 1),
   <function __main__.gen_range(start, stop, step)>,
   (200, 400, 1),
   1),
  ('1000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 1000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (1000, 2000, 1),
   1),
  ('5000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 5000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (5000, 10000, 1),
   1),
  ('25000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 25000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (25000, 50000, 1),
   1),
  ('125000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 125000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (125000, 250000, 1),
   1),
  ('625000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (625000, 1250000, 1),
   1)],
 'equal_p625000': [('200',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   200),
  ('1000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   1000),
  ('5000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   5000),
  ('25000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   25000),
  ('125000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   125000),
  ('625000',
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   <function __main__.gen_range(start, stop, step)>,
   (0, 625000, 1),
   625000)]}
[12]:
def collect_timings(
        checker, na, nb,
        cases, timings=None):
    """Orchestrate timings

    Args:


    Keyword args:
        timings: An optional timings mapping which results should be
            put into.

    Returns:
        timings mapping
    """

    # Timed function has to be in global namespace to be discovered by %timeit magic
    global timed_args
    global timed_kwargs
    global timed_func

    if timings is None:
        timings = {}

    progress = tqdm(cases, desc="Run completed")

    for run_index, (run_id, gen_a, args_a, gen_b, args_b, c) in enumerate(progress):

        na.reset()
        for index in gen_a(*args_a):
            na.assign(index)

        nb.reset()
        for index in gen_b(*args_b):
            nb.assign(index)

        cluster_params = _types.ClusterParameters(0, c)

        timings[run_id] = %timeit -q -o checker.check(na, nb, cluster_params)

        tqdm.write(f"Timed run ID: {run_id:>10}    ({datetime.now().strftime('%d.%m.%Y %H:%M:%S')})")

    return timings

Plots

[13]:
def evaluate_timings(
        RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
        normal_ax=None, log_ax=None,
        plot_props=None, legend_props=None):
    """Plot measured timings together with their fitted growth curves

    Args:
        RUN_TIMINGS_MAP: Mapping of run names to timings. The timings of
            a run are forwarded to `helper.plot_timings`.
        RUN_SCALING_MAP: Mapping of run names to
            ``((newx, fity), (b, b_error))``, i.e. the points of the
            fitted curve and the value/error pair shown in the legend.
        run_name_list: Iterable of ``(name, label)`` pairs. `name` is
            the key into both mappings, `label` the legend text.

    Keyword args:
        normal_ax: Axes for the plot with default axis scaling.
        log_ax: Axes for the plot with logarithmic x- and y-scale.
            If neither `normal_ax` nor `log_ax` is given, a new figure
            with two stacked axes is created.
        plot_props: Passed on to `helper.plot_timings` as `plot_props`.
        legend_props: Overrides for the default legend properties.

    Returns:
        The figure the plots were drawn on
    """

    if normal_ax is None and log_ax is None:
        fig, (normal_ax, log_ax) = plt.subplots(2, 1)
    elif normal_ax is not None:
        fig = normal_ax.get_figure()
    else:
        fig = log_ax.get_figure()

    if plot_props is None:
        plot_props = {}

    default_legend_props = {
        "fancybox": False,
        "framealpha": 1,
        "edgecolor": "k",
        "fontsize": "xx-small",
        }

    if legend_props is not None:
        default_legend_props.update(legend_props)

    def plot_run(ax, name):
        """Draw timings and fitted curve of one run, return the timings line"""
        (newx, fity), _ = RUN_SCALING_MAP[name]
        timings_line, = helper.plot_timings(
            RUN_TIMINGS_MAP[name],
            ax=ax,
            id_to_x=int,
            sort_ids=True,
            set_ax_props=False,
            plot_props=plot_props
        )
        # Fit drawn dashed, behind the markers and in the same colour
        ax.plot(
            newx, fity,
            linestyle="--",
            color=timings_line.get_color(),
            marker="None",
            zorder=0
        )
        return timings_line

    fit_legend_labels = []
    normal_lines = []
    log_lines = []

    for name, label in run_name_list:

        _, b = RUN_SCALING_MAP[name]

        if normal_ax is not None:
            normal_lines.append(plot_run(normal_ax, name))

        if log_ax is not None:
            log_lines.append(plot_run(log_ax, name))

        # At least one axes exists at this point, so exactly one label is
        # needed per run. The backslash is escaped explicitly ("\p" is an
        # invalid escape sequence).
        fit_legend_labels.append(
            f"{label} ($b = {b[0]:.1f} \\pm {b[1]:.1f}$)"
        )

    if normal_ax is not None:
        normal_ax.set(**{
            "xlabel": None,
            "ylabel": None,
            "ylim": (0, None)
        })

        if log_ax is not None:
            # Both axes are used: move the ticks of the normal axes to the top
            normal_ax.xaxis.tick_top()

    if log_ax is not None:
        log_ax.set(**{
            "xlabel": None,
            "ylabel": None,
            "xscale": "log",
            "yscale": "log",
            "ylim": (1e-6, None)
        })

    # The legend goes onto the normal axes if there is one
    if normal_ax is not None:
        legend_ax = normal_ax
        legend_lines = normal_lines
    else:
        legend_ax = log_ax
        legend_lines = log_lines

    legend = legend_ax.legend(
        legend_lines,
        fit_legend_labels,
        **default_legend_props
        )
    legend.get_frame().set_linewidth(0.5)

    return fig
[14]:
# Marker symbols for the property cycle of the plots (16 entries, "o" is
# listed twice)
marker_list = [
    "o", "v", "^", "s", "p", "P", "*", "h",
    "d", "H", "D", ">", "<", "8", "X", "o"

]
# Hex RGB colours written without a leading "#" (16 entries, same length
# as marker_list)
color_list = [
    '396ab1', 'da7c30', '3e9651', 'cc2529', '535154', '6b4c9a', '922428', '948b3d',
    '7293cb', 'e1974c', '84ba5b', 'd35e60', '9067a7', 'ab6857', 'ccc210', '808585'
]

Runs

[15]:
# Keep a timings map that already exists; create an empty one otherwise
RUN_TIMINGS_MAP = globals().get("RUN_TIMINGS_MAP", {})
[16]:
# Directory of the JSON timing reports (one file per case/run combination)
report_dir = pathlib.Path("reports/qcm07/cnnclustering_similarity_check")
[17]:
# Load the saved timing report of every case/run combination.
# Combinations without a report file are skipped.
case_list = CASES_MAP.keys()

run_list = [
    "run_a_a",
    "run_a_b",
    "run_a_c",
    "run_b_a",
    "run_b_b",  # was missing, so saved run_b_b reports were never loaded
    "run_b_c",
    "run_c_a",
    "run_c_b"
]

for run_name in run_list:
    for case_name in case_list:
        full_run_name = f"{case_name}_{run_name}"
        report_file = report_dir / f"{full_run_name}.json"
        try:
            RUN_TIMINGS_MAP[full_run_name] = helper.load_report(report_file)
        except FileNotFoundError:
            # No report has been saved for this combination
            pass

CommonNN similarity check

Checking of similarity criterion for fixed cases of neighbour lists:

  • SimilarityCheckerExtContains

    • NeighboursExtVector run a_a

    • NeighboursExtCPPUnorderedSet run a_b

    • NeighboursExtVectorCPPUnorderedSet run a_c

  • SimilarityCheckerExtSwitchContains

    • NeighboursExtVector run b_a

    • NeighboursExtCPPUnorderedSet run b_b

    • NeighboursExtVectorCPPUnorderedSet run b_c

  • SimilarityCheckerExtScreensorted

    • NeighboursExtVector run c_a

    • NeighboursExtVectorCPPUnorderedSet run c_b

#### Run a a

[18]:
# Show the names of all defined cases
case_list = CASES_MAP.keys()
case_list
[18]:
dict_keys(['equal_c50', 'mixed_c50', 'shuffled_c50', 'mixed_shuffled_c50', 'equal_c33', 'mixed_c33', 'shuffled_c33', 'mixed_shuffled_c33', 'no_check', 'diff', 'equal_p625000'])
[18]:
# Cases timed for run a_a; the commented-out names are left out of this run
case_list = [
    # 'equal_c50',
    # 'mixed_c50',
    # 'shuffled_c50',
    # 'mixed_shuffled_c50',
    # 'equal_c33',
    # 'mixed_c33',
    # 'shuffled_c33',
    # 'mixed_shuffled_c33',
    # 'no_check',
    'diff',
    'equal_p625000'
]
[19]:
# Run a_a: SimilarityCheckerExtContains on two NeighboursExtVector containers
checker = _types.SimilarityCheckerExtContains()
na = _types.NeighboursExtVector(1)
nb = _types.NeighboursExtVector(1)
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_a_a"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()
Collection for run: diff_run_a_a
Timed run ID:        200    (18.06.2021 21:56:20)
Timed run ID:       1000    (18.06.2021 21:56:22)

#### Run a b

[ ]:
# Cases timed for run a_b (all cases defined in CASES_MAP)
case_list = [
    'equal_c50',
    'mixed_c50',
    'shuffled_c50',
    'mixed_shuffled_c50',
    'equal_c33',
    'mixed_c33',
    'shuffled_c33',
    'mixed_shuffled_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[ ]:
# Run a_b: SimilarityCheckerExtContains on two NeighboursExtCPPUnorderedSet containers
checker = _types.SimilarityCheckerExtContains()
na = _types.NeighboursExtCPPUnorderedSet()
nb = _types.NeighboursExtCPPUnorderedSet()
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_a_b"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()

#### Run a c

[ ]:
# Cases timed for run a_c (all cases defined in CASES_MAP)
case_list = [
    'equal_c50',
    'mixed_c50',
    'shuffled_c50',
    'mixed_shuffled_c50',
    'equal_c33',
    'mixed_c33',
    'shuffled_c33',
    'mixed_shuffled_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[ ]:
# Run a_c: SimilarityCheckerExtContains on two NeighboursExtVectorCPPUnorderedSet containers
checker = _types.SimilarityCheckerExtContains()
na = _types.NeighboursExtVectorCPPUnorderedSet(1)
nb = _types.NeighboursExtVectorCPPUnorderedSet(1)
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_a_c"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()

#### Run b a

[ ]:
# Cases timed for run b_a (all cases defined in CASES_MAP)
case_list = [
    'equal_c50',
    'mixed_c50',
    'shuffled_c50',
    'mixed_shuffled_c50',
    'equal_c33',
    'mixed_c33',
    'shuffled_c33',
    'mixed_shuffled_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[ ]:
# Run b_a: SimilarityCheckerExtSwitchContains on two NeighboursExtVector containers
checker = _types.SimilarityCheckerExtSwitchContains()
na = _types.NeighboursExtVector(1)
nb = _types.NeighboursExtVector(1)
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_b_a"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()

#### Run b b

[ ]:
# Cases timed for run b_b (all cases defined in CASES_MAP)
case_list = [
    'equal_c50',
    'mixed_c50',
    'shuffled_c50',
    'mixed_shuffled_c50',
    'equal_c33',
    'mixed_c33',
    'shuffled_c33',
    'mixed_shuffled_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[ ]:
# Run b_b: SimilarityCheckerExtSwitchContains on two NeighboursExtCPPUnorderedSet containers
checker = _types.SimilarityCheckerExtSwitchContains()
na = _types.NeighboursExtCPPUnorderedSet()
nb = _types.NeighboursExtCPPUnorderedSet()
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_b_b"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()

#### Run b c

[ ]:
# Cases timed for run b_c (all cases defined in CASES_MAP)
case_list = [
    'equal_c50',
    'mixed_c50',
    'shuffled_c50',
    'mixed_shuffled_c50',
    'equal_c33',
    'mixed_c33',
    'shuffled_c33',
    'mixed_shuffled_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[ ]:
# Run b_c: SimilarityCheckerExtSwitchContains on two NeighboursExtVectorCPPUnorderedSet containers
checker = _types.SimilarityCheckerExtSwitchContains()
na = _types.NeighboursExtVectorCPPUnorderedSet(1)
nb = _types.NeighboursExtVectorCPPUnorderedSet(1)
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_b_c"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()

#### Run c a

[18]:
# Cases timed for run c_a (the cases named "shuffled" are not included)
case_list = [
    'equal_c50',
    'mixed_c50',
    'equal_c33',
    'mixed_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[19]:
# Run c_a: SimilarityCheckerExtScreensorted on two NeighboursExtVector containers
checker = _types.SimilarityCheckerExtScreensorted()
na = _types.NeighboursExtVector(1)
nb = _types.NeighboursExtVector(1)
[20]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_c_a"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()
Collection for run: equal_c50_run_c_a
Timed run ID:        200    (21.06.2021 21:28:48)
Timed run ID:       1000    (21.06.2021 21:28:51)
Timed run ID:       5000    (21.06.2021 21:28:55)
Timed run ID:      25000    (21.06.2021 21:29:08)
Timed run ID:     125000    (21.06.2021 21:29:13)
Timed run ID:     625000    (21.06.2021 21:29:16)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.16
      5000: 1.99
     25000: 6.30
    125000: 26.35
    625000: 128.20

Collection for run: mixed_c50_run_c_a
Timed run ID:        200    (21.06.2021 21:29:18)
Timed run ID:       1000    (21.06.2021 21:29:20)
Timed run ID:       5000    (21.06.2021 21:29:24)
Timed run ID:      25000    (21.06.2021 21:29:36)
Timed run ID:     125000    (21.06.2021 21:29:40)
Timed run ID:     625000    (21.06.2021 21:29:42)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.17
      5000: 2.06
     25000: 5.72
    125000: 22.13
    625000: 91.25

Collection for run: equal_c33_run_c_a
Timed run ID:        200    (21.06.2021 21:29:45)
Timed run ID:       1000    (21.06.2021 21:29:47)
Timed run ID:       5000    (21.06.2021 21:29:50)
Timed run ID:      25000    (21.06.2021 21:29:59)
Timed run ID:     125000    (21.06.2021 21:30:03)
Timed run ID:     625000    (21.06.2021 21:30:05)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.11
      5000: 1.68
     25000: 4.65
    125000: 18.90
    625000: 87.43

Collection for run: mixed_c33_run_c_a
Timed run ID:        200    (21.06.2021 21:30:07)
Timed run ID:       1000    (21.06.2021 21:30:10)
Timed run ID:       5000    (21.06.2021 21:30:14)
Timed run ID:      25000    (21.06.2021 21:30:16)
Timed run ID:     125000    (21.06.2021 21:30:23)
Timed run ID:     625000    (21.06.2021 21:30:25)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.21
      5000: 2.21
     25000: 9.27
    125000: 33.80
    625000: 125.21

Collection for run: no_check_run_c_a
Timed run ID:        200    (21.06.2021 21:30:27)
Timed run ID:       1000    (21.06.2021 21:30:29)
Timed run ID:       5000    (21.06.2021 21:30:31)
Timed run ID:      25000    (21.06.2021 21:30:33)
Timed run ID:     125000    (21.06.2021 21:30:35)
Timed run ID:     625000    (21.06.2021 21:30:37)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.00
    625000: 1.00
    125000: 1.00
      5000: 1.00
     25000: 1.01

Collection for run: diff_run_c_a
Timed run ID:        200    (21.06.2021 21:30:39)
Timed run ID:       1000    (21.06.2021 21:30:41)
Timed run ID:       5000    (21.06.2021 21:30:45)
Timed run ID:      25000    (21.06.2021 21:30:59)
Timed run ID:     125000    (21.06.2021 21:31:05)
Timed run ID:     625000    (21.06.2021 21:31:07)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.15
      5000: 1.97
     25000: 6.73
    125000: 26.58
    625000: 123.95

Collection for run: equal_p625000_run_c_a
Timed run ID:        200    (21.06.2021 21:31:10)
Timed run ID:       1000    (21.06.2021 21:31:12)
Timed run ID:       5000    (21.06.2021 21:31:19)
Timed run ID:      25000    (21.06.2021 21:31:21)
Timed run ID:     125000    (21.06.2021 21:31:32)
Timed run ID:     625000    (21.06.2021 21:31:37)
    Run ID: Factor
=======================
       200: 1.00
      1000: 1.34
      5000: 2.89
     25000: 11.25
    125000: 50.06
    625000: 247.90

#### Run c b

[30]:
# Cases timed for run c_b (the cases named "shuffled" are not included)
case_list = [
    'equal_c50',
    'mixed_c50',
    'equal_c33',
    'mixed_c33',
    'no_check',
    'diff',
    'equal_p625000'
]
[31]:
# Run c_b: SimilarityCheckerExtScreensorted on two NeighboursExtVectorCPPUnorderedSet containers
checker = _types.SimilarityCheckerExtScreensorted()
na = _types.NeighboursExtVectorCPPUnorderedSet(1)
nb = _types.NeighboursExtVectorCPPUnorderedSet(1)
[ ]:
# Time every selected case, save its report and print the timing ratios
run_name = "run_c_b"
for case_name in case_list:
    full_run_name = f"{case_name}_{run_name}"
    report_file = report_dir / f"{full_run_name}.json"
    print(f"Collection for run: {full_run_name}")

    # Register a fresh mapping first; collect_timings fills it in place
    run_timings = {}
    RUN_TIMINGS_MAP[full_run_name] = run_timings
    collect_timings(
        checker, na, nb, CASES_MAP[case_name], timings=run_timings
    )

    helper.save_report(run_timings, report_file, overwrite=True)

    ratios = helper.get_ratios(run_timings)
    helper.print_ratios(ratios)
    print()
Collection for run: equal_c50_run_c_b
Timed run ID:        200    (21.06.2021 21:33:48)
Timed run ID:       1000    (21.06.2021 21:33:51)
Timed run ID:       5000    (21.06.2021 21:33:56)
Timed run ID:      25000    (21.06.2021 21:34:10)
[ ]:
RUN_TIMINGS_MAP.keys()

Scaling

Fit for empirical growth function:

$t = a n^b$ (fitted by default), or optionally with a constant offset, $t = a n^b + c$

[ ]:
def growth(n, a, b):
    """Power-law growth model ``a * n**b``."""
    return a * n**b


def growth_with_c(n, a, b, c):
    """Power-law growth model with a constant offset, ``a * n**b + c``."""
    return a * n**b + c


def scale(x, y, newx, f=growth, p0=None):
    """Fit a growth model to measured data and evaluate it at new points.

    Args:
        x: Positions of the measurements (passed to ``curve_fit`` as xdata).
        y: Measured values (passed to ``curve_fit`` as ydata).
        newx: Positions at which the fitted model is evaluated.
        f: Model function called as ``f(n, *params)``.
        p0: Initial guess for the model parameters.  If ``None``, the guess
            is ``(0.1, 1.5)`` for the first two parameters and ``0.0`` for
            every further one, with the parameter count taken from the
            signature of ``f``.

    Returns:
        ``(fity, (popt, perr))`` where ``fity`` is the fitted model at
        ``newx``, ``popt`` the optimal parameters and ``perr`` their
        standard errors (square root of the covariance diagonal).
        If the fit raises ``RuntimeError`` the error is printed and
        ``None`` is returned.
    """
    if p0 is None:
        import inspect

        base_guess = (0.1, 1.5)
        try:
            # The first argument of the model is the independent variable.
            n_params = len(inspect.signature(f).parameters) - 1
        except (TypeError, ValueError):
            # Signature not introspectable: keep the two-parameter guess.
            n_params = len(base_guess)
        n_extra = max(0, n_params - len(base_guess))
        p0 = base_guess[:n_params] + (0.0,) * n_extra

    try:
        popt, pcov = curve_fit(f, x, y, p0=p0)
    except RuntimeError as error:
        print(error)
        return None

    perr = np.sqrt(np.diag(pcov))
    # Evaluate the model that was actually fitted, not always ``growth``.
    return f(newx, *popt), (popt, perr)
[ ]:
# Create the store for fit results only once, so that re-running this cell
# keeps results that are already present.
if "RUN_SCALING_MAP" not in globals():
    RUN_SCALING_MAP = {}
[ ]:
# Fit the growth model to the best timing of every run.  For each run the
# fitted curve is stored together with the exponent b and its standard error.
for full_run_name, timings in RUN_TIMINGS_MAP.items():
    print(full_run_name)

    if len(timings) < 2:
        # A collection that was interrupted leaves an empty or single-point
        # record; a two-parameter fit cannot be made from it.
        print("  skipped: fewer than two timings")
        continue

    # (run ID, best time) pairs ordered by run ID.
    points = sorted((int(k), v.best) for k, v in timings.items())
    x, y = (np.asarray(values) for values in zip(*points))

    newx = np.linspace(x[0], x[-1], 100)
    fit = scale(x, y, newx)
    if fit is None:
        # scale() prints the error and returns None when the fit fails.
        continue

    fity, (popt, perr) = fit
    RUN_SCALING_MAP[full_run_name] = (newx, fity), (popt[1], perr[1])

Plots

Run a a

[220]:
# Pair every "run_a_a" timings key with a legend label: the key without the
# run suffix and with underscores replaced by spaces.
run_name_list = [
    (name, name.replace("_run_a_a", "").replace("_", " "))
    for name in RUN_TIMINGS_MAP
    if "run_a_a" in name
]
[232]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 10 + "Contains vector")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")
[232]:
Text(0, 0.5, 'time / s')
../_images/benchmark_bm_similarity_70_1.png

Run a b

[234]:
# Pair every "run_a_b" timings key with a legend label: the key without the
# run suffix and with underscores replaced by spaces.
run_name_list = [
    (name, name.replace("_run_a_b", "").replace("_", " "))
    for name in RUN_TIMINGS_MAP
    if "run_a_b" in name
]
[235]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 10 + "Contains UnorderedSet")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")
[235]:
Text(0, 0.5, 'time / s')
../_images/benchmark_bm_similarity_73_1.png

Run a c

[236]:
# Pair every "run_a_c" timings key with a legend label: the key without the
# run suffix and with underscores replaced by spaces.
run_name_list = [
    (name, name.replace("_run_a_c", "").replace("_", " "))
    for name in RUN_TIMINGS_MAP
    if "run_a_c" in name
]
[237]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 10 + "Contains Vector/UnorderedSet")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")
[237]:
Text(0, 0.5, 'time / s')
../_images/benchmark_bm_similarity_76_1.png

Run b a

[28]:
# Pair every "run_b_a" timings key with a legend label: the key without the
# run suffix and with underscores replaced by spaces.
run_name_list = [
    (name, name.replace("_run_b_a", "").replace("_", " "))
    for name in RUN_TIMINGS_MAP
    if "run_b_a" in name
]
[29]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 10 + "Switch Vector")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")
[29]:
Text(0, 0.5, 'time / s')
../_images/benchmark_bm_similarity_79_1.png

Compare switch/no switch

[248]:
# Hand-picked "mixed_c50" runs to compare, each paired with its legend label.
run_name_list = [
    ("mixed_c50_run_a_a", "Contains (Vector)"),
    # ("mixed_c50_run_a_b", "Contains (Un.Set)"),
    ("mixed_c50_run_a_c", "Contains (Vector/Un.Set)"),
    ("mixed_c50_run_b_a", "Switch (Vector)"),
    ("mixed_c50_run_b_c", "Switch (Vector/Un.Set)"),
]
[249]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 20 + "Switch/No switch")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")
[249]:
Text(0, 0.5, 'time / s')
../_images/benchmark_bm_similarity_82_1.png

Compare worst case

[ ]:
# Hand-picked "diff" runs to compare, each paired with its legend label.
run_name_list = [
    ("diff_run_a_a", "Contains (Vector)"),
    ("diff_run_a_c", "Contains (Vector/Un.Set)"),
    ("diff_run_b_a", "Switch (Vector)"),
    ("diff_run_b_c", "Switch (Vector/Un.Set)"),
    ("diff_run_c_a", "Screen (Vector)"),
    ("diff_run_c_b", "Screen (Vector/Un.Set)"),
]
[ ]:
# Close figures left open by earlier cells before creating a new one.
plt.close("all")
fig, log_ax = plt.subplots()
# Markers and colours are cycled together for successive plotted runs.
log_ax.set_prop_cycle(
    marker=marker_list,
    color=color_list,
)
# Draw the runs selected in run_name_list onto log_ax: markers without a
# connecting line, with thin black marker edges.
# legend_props is presumably forwarded to the legend call (loc just above
# the axes) -- confirm in evaluate_timings.
_ = evaluate_timings(
    RUN_TIMINGS_MAP, RUN_SCALING_MAP, run_name_list,
    log_ax=log_ax,
    plot_props={
        "linestyle": "",
        "markeredgecolor": "k",
        "markeredgewidth": 0.75,
        },
    legend_props={
        "loc": (0, 1.01)
        }
    )

# Leading spaces push the title text to the right, presumably to keep it
# clear of the legend placed above the axes.
log_ax.set_title(" " * 20 + "Worst case")
log_ax.set_xlabel("#points")
log_ax.set_ylabel("time / s")