Benchmark clustering performance

Remember to compile cnnclustering with TRACE_CYTHON=0 when timings should be measured. TRACE_CYTHON=1 is required for line profiling and test coverage but causes substantial overhead.

Table of Contents

  • 1  Function definitions

    • 1.1  Benchmark helper

      • 1.1.1  Collect

      • 1.1.2  Initialisation

    • 1.2  Timed functions

    • 1.3  Data set generation functions

    • 1.4  Data transformation

    • 1.5  Plotting

  • 2  Runs

    • 2.1  Example

    • 2.2  CommonNN Clustering

      • 2.2.1  Run a

      • 2.2.2  Run b

      • 2.2.3  Run c

      • 2.2.4  Run c b

      • 2.2.5  Run c c

      • 2.2.6  Run d

      • 2.2.7  Run d b

      • 2.2.8  Run d c

      • 2.2.9  Plots

[82]:
from datetime import datetime
import json
from operator import itemgetter
import os
import pathlib
import sys
import time
import timeit

import cnnclustering
from cnnclustering import cluster
from cnnclustering import _fit, _primitive_types, _types
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset
import numpy as np
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

import helper
[32]:
mpl.rcParams["figure.dpi"] = 300
[3]:
print(f"{'Python':>20} :  ", *sys.version.splitlines())
              Python :   3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27)  [GCC 9.3.0]
[4]:
print(f"cnnclustering :  ", cnnclustering.__version__)
cnnclustering :   0.3.11

Function definitions

Benchmark helper

Collect

[6]:
def collect_timings(
        gen_func, setup_func, run_arguments_list,
        transform_func=None, timings=None, repeats=10):
    """Orchestrate timings

    Args:
        gen_func: A function, returning data. Called with
            run arguments "gen".
        setup_func: A function, accepting data and returning a
            function which should be timed. Called with
            run arguments "setup".
        run_argumens_list: A list of run arguments.

    Keyword args:
        transform_func: A function, transforming generated data before setup.
        timings: An optional timings mapping which results should
            put into.
        repeats: Repeats the timing *n* times. Using timeit -n/-r directly would
            not ensure running the setup before each timing.

    Returns:
        timings mapping
    """

    # Timed function has to be in global namespace to be discovered by %timeit magic
    global timed_args
    global timed_kwargs
    global timed_func

    if timings is None:
        timings = {}

    progress = tqdm(run_arguments_list, desc="Run completed")

    for run_index, arguments in enumerate(progress):

        gen_args, gen_kwargs = arguments.get("gen", ((), {}))
        data = gen_func(*gen_args, **gen_kwargs)

        if transform_func is not None:
            trans_args, trans_kwargs = arguments.get("transform", ((), {}))
            data = transform_func(data, *trans_args, **trans_kwargs)

        timeit_results = []
        for _ in range(repeats):
            try:
                setup_args, setup_kwargs = arguments.get("setup", ((), {}))
                timed_func = setup_func(data, *setup_args, **setup_kwargs)

                timed_args, timed_kwargs = arguments.get("timed", ((), {}))
                o = %timeit -n 1 -r 1 -q -o timed_func(*timed_args, **timed_kwargs)
            except MemoryError:
                o = "MemoryError"
                break

            finally:
                timeit_results.append(o)

        run_id = arguments.get("id", str(run_index))
        tqdm.write(f"Timed run ID: {run_id:>10}    ({datetime.now().strftime('%d.%m.%Y %H:%M:%S')})")
        timings[run_id] = combine_timeit_results(*timeit_results)

    return timings
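
Note: combine_timeit_results is not defined in this section. A plausible sketch of what it does, assuming IPython's TimeitResult constructor (loops, repeat, best, worst, all_runs, compile_time, precision): it pools the raw timings of the individual single-loop results into one result and propagates "MemoryError" markers.

from IPython.core.magics.execution import TimeitResult

def combine_timeit_results(*timeit_results):
    """Pool single-loop timeit results into one result (sketch)"""

    all_runs = []
    for result in timeit_results:
        if result == "MemoryError":
            # Keep the failure marker, e.g. after an aborted repeat loop
            return "MemoryError"
        all_runs.extend(result.all_runs)

    return TimeitResult(
        1, len(all_runs), min(all_runs), max(all_runs), all_runs, 0.0, 3
        )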

Initialisation

[12]:
def make_maps():
    """Initialise benchmark setting/result mappings"""

    global RUN_ARGUMENTS_MAP
    global RUN_TIMINGS_MAP

    if "RUN_ARGUMENTS_MAP" not in dir():
        RUN_ARGUMENTS_MAP = {}

    if "RUN_TIMINGS_MAP" not in dir():
        RUN_TIMINGS_MAP = {}
[13]:
def del_maps():
    %xdel RUN_ARGUMENTS_MAP
    %xdel RUN_TIMINGS_MAP

Timed functions

[14]:
def setup_sleep(data):
    """Dummy example"""

    return time.sleep
[15]:
def setup_commonnn_clustering__fit(data, preparation_hook=cluster.prepare_pass, recipe=None):
    """Prepare benchmark of :meth:`cnnclustering.cluster.Clustering._fit`"""

    if recipe is None:
        recipe = {}

    clustering = cluster.prepare_clustering(data, preparation_hook, **recipe)
    clustering._labels = _types.Labels(
        np.zeros(clustering._input_data.n_points, order="C", dtype=_primitive_types.P_AINDEX)
        )

    return clustering._fit
[16]:
def setup_commonnn_clustering_fit(data, preparation_hook=cluster.prepare_pass, recipe=None):
    """Prepare benchmark of :meth:`cnnclustering.cluster.Clustering.fit`"""

    if recipe is None:
        recipe = {}

    clustering = cluster.prepare_clustering(data, preparation_hook, **recipe)

    return clustering.fit
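
Both setup functions return a bound method, which is what ends up being timed. A minimal usage sketch (default_recipe and _types.ClusterParameters are used exactly as in the runs below):

data = helper.gen_no_structure_points((100, 2))
timed_func = setup_commonnn_clustering__fit(data, recipe=default_recipe)
timed_func(_types.ClusterParameters(0.1, 0))  # radius_cutoff, cnn_cutoff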

Data set generation functions

[17]:
def gen_dummy():
    return []
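
The actual data set generators, like helper.gen_no_structure_points used below, come from the accompanying helper module. A minimal stand-in, assuming it draws uniformly distributed points of a given shape (the random_state argument is hypothetical):

import numpy as np

def gen_no_structure_points(shape, random_state=None):
    """Uniformly distributed points in the unit square (stand-in sketch)"""
    rng = np.random.default_rng(random_state)
    return rng.random(shape)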

Data transformation

[19]:
def compute_neighbours(data, radius, sort=False):
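    """Compute fixed-radius point neighbourhoods with a k-d tree"""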

    tree = KDTree(data)
    neighbourhoods = tree.query_radius(
        data, r=radius, return_distance=False
        )

    if sort:
        for n in neighbourhoods:
            n.sort()

    return neighbourhoods
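
Example usage; each element of the returned array holds the indices of all points within the radius around the corresponding point (including the point itself):

points = helper.gen_no_structure_points((100, 2))
neighbourhoods = compute_neighbours(points, 0.1, sort=True)
print(len(neighbourhoods), neighbourhoods[0][:5])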

Plotting

[21]:
def annotate_memory_error(
        ax, line, memory_error_id):
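    """Mark a run that ended with a MemoryError at the end of a timings line"""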

    last_sample_pos = line.get_xydata()[-1]
    memory_error_pos = np.array([memory_error_id, last_sample_pos[-1]])
    ax.plot(
        *np.vstack([last_sample_pos, memory_error_pos]).T,
        color="k",
        linestyle="--",
        zorder=0
    )
    ax.plot(
        *memory_error_pos,
        color=line.get_color(),
        marker="X",
        markeredgecolor="k",
        zorder=0
    )
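
plot_timings is another helper that is not shown in this section. A plausible sketch, consistent with how it is called below: it draws one line of average timings (in seconds) against a numeric x derived from the run IDs, skipping "MemoryError" entries.

def plot_timings(
        timings, ax=None, id_to_x=int, sort_ids=False,
        set_ax_props=True, plot_props=None):
    """Plot a timings mapping as a single line (sketch)"""

    if ax is None:
        ax = plt.gca()
    if plot_props is None:
        plot_props = {}

    points = [
        (id_to_x(run_id), result.average)
        for run_id, result in timings.items()
        if result != "MemoryError"
        ]
    if sort_ids:
        points.sort(key=itemgetter(0))

    ax.plot(*zip(*points), **plot_props)

    if set_ax_props:
        ax.set(xlabel="points", ylabel="time / s")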

Runs

Example

[25]:
example_run = [
    {
        "id": "1",
        "gen": (
            (), {}
        ),
        "setup": (
            (), {}
        ),
        "timed": (
            (0.1,), {
            }
        ),
    },
    {
        "id": "2",
        "gen": (
            (), {}
        ),
        "setup": (
            (), {}
        ),
        "timed": (
            (0.2,), {
            }
        ),
    }
]

sleep_timings = collect_timings(
    gen_dummy,
    setup_sleep,
    example_run
)

print()
print_ratios(get_ratios(sleep_timings))
Timed run ID:          1    (06.05.2021 16:49:54)
Timed run ID:          2    (06.05.2021 16:49:56)

    Run ID: Factor
=======================
         1: 1.00
         2: 2.00
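
get_ratios and print_ratios are likewise helpers that are not shown here. Sketches that reproduce the output format above; note that relating each run to the fastest one and sorting by factor would also explain the occasional out-of-order run IDs in the tables further below:

def get_ratios(timings, which="average"):
    """Relate all timings to the fastest run (sketch)"""

    values = {
        run_id: getattr(result, which)  # e.g. "average" or "best"
        for run_id, result in timings.items()
        if result != "MemoryError"
        }
    reference = min(values.values())

    return sorted(
        ((run_id, value / reference) for run_id, value in values.items()),
        key=itemgetter(1)
        )

def print_ratios(ratios):
    """Print (run ID, factor) pairs as a small table (sketch)"""

    print(f"{'Run ID':>10}: Factor")
    print("=" * 23)
    for run_id, factor in ratios:
        print(f"{run_id:>10}: {factor:.2f}")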
[26]:
sleep_timings
[26]:
{'1': <TimeitResult : 100 ms ± 2.42 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2': <TimeitResult : 200 ms ± 3.87 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[27]:
fig, ax = plt.subplots(figsize=(1, 1/1.618))
plot_timings(sleep_timings, ax=ax, id_to_x=lambda x: int(x))
../_images/benchmark_bm_28_0.png

CommonNN Clustering

[22]:
report_dir = pathlib.Path("reports/qcm07/cnnclustering_fit")

Clustering of a 2D toy data set (no structure) with an increasing number of points, while dimensionality, cluster parameters, and cluster recipe are fixed:

  • Default recipe:

    • \(d = 2\), \(r = 0.1\), \(c = 0\) : run a

  • Distance recipe:

    • \(d = 2\), \(r = 0.1\), \(c = 0\) : run b

  • Neighbours recipe:

    • \(d = 2\), \(r = 0.1\), \(c = 0\) : run c

    • \(d = 2\), \(r = 0.1\), \(c = 100\) : run c b

    • \(d = 2\), \(r = 0.3\), \(c = 0\) : run c c

  • Sorted neighbours recipe:

    • \(d = 2\), \(r = 0.1\), \(c = 0\) : run d

    • \(d = 2\), \(r = 0.1\), \(c = 100\) : run d b

    • \(d = 2\), \(r = 0.3\), \(c = 0\) : run d c

[29]:
# The test data
fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *helper.gen_no_structure_points((5000, 2)).T,
    s=0.5,
)
[29]:
<matplotlib.collections.PathCollection at 0x7f26f1d37a00>
../_images/benchmark_bm_32_1.png
[23]:
n_points_list = [500 * 2**x for x in range(10)]
[24]:
default_recipe = {
    "input_data": _types.InputDataExtPointsMemoryview,
    "neighbours_getter": _types.NeighboursGetterExtBruteForce,
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtEuclideanReduced,
    "similarity_checker": _types.SimilarityCheckerExtContains,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterExtBFS,
}
[25]:
distance_recipe = {
    "input_data": _types.InputDataExtPointsMemoryview,
    "neighbours_getter": _types.NeighboursGetterExtBruteForce,
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtPrecomputed,
    "similarity_checker": _types.SimilarityCheckerExtContains,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterExtBFS,
}
[26]:
neighbours_recipe = {
    "input_data": _types.InputDataExtNeighboursMemoryview,
    "neighbours_getter": _types.NeighboursGetterExtLookup,
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtDummy,
    "similarity_checker": _types.SimilarityCheckerExtContains,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterExtBFS,
}
[27]:
neighbours_sorted_recipe = {
    "input_data": _types.InputDataExtNeighboursMemoryview,
    "neighbours_getter": _types.NeighboursGetterExtLookup,
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtDummy,
    "similarity_checker": _types.SimilarityCheckerExtScreensorted,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterExtBFS,
}
[28]:
make_maps()
[29]:
run_name_list = [
    'no_structure_run_a',
    'no_structure_run_b',
    'no_structure_run_c',
    'no_structure_run_c_b',
    'no_structure_run_c_c',
    'no_structure_run_d',
    'no_structure_run_d_b',
    'no_structure_run_d_c',
]

for run_name in run_name_list:
    report_file = report_dir / f"{run_name}.json"
    RUN_TIMINGS_MAP[run_name] = load_report(report_file)
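
load_report and save_report are also part of the omitted helper definitions. Plausible sketches, assuming a report stores the raw per-loop timings (or a "MemoryError" marker) per run ID as JSON:

from IPython.core.magics.execution import TimeitResult

def save_report(timings, report_file, overwrite=False):
    """Serialise a timings mapping to JSON (sketch)"""

    report_file = pathlib.Path(report_file)
    if report_file.is_file() and not overwrite:
        raise FileExistsError(f"{report_file} already exists")

    serialisable = {
        run_id: result if isinstance(result, str) else result.all_runs
        for run_id, result in timings.items()
        }

    report_file.parent.mkdir(parents=True, exist_ok=True)
    with open(report_file, "w") as f:
        json.dump(serialisable, f)

def load_report(report_file):
    """Rebuild a timings mapping from JSON (sketch)"""

    with open(report_file) as f:
        raw = json.load(f)

    return {
        run_id: runs if isinstance(runs, str) else TimeitResult(
            1, len(runs), min(runs), max(runs), runs, 0.0, 3)
        for run_id, runs in raw.items()
        }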

Run a

[36]:
run_name = "no_structure_run_a"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = None
setup_func = setup_commonnn_clustering__fit
[37]:
radius_cutoff = 0.1
cnn_cutoff = 0

RUN_ARGUMENTS_MAP[run_name] = []
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "setup": (
                (), {"recipe": default_recipe}
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[38]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_a
Timed run ID:        500    (06.05.2021 16:49:56)
Timed run ID:       1000    (06.05.2021 16:49:56)
Timed run ID:       2000    (06.05.2021 16:49:57)
Timed run ID:       4000    (06.05.2021 16:49:58)
Timed run ID:       8000    (06.05.2021 16:50:01)
Timed run ID:      16000    (06.05.2021 16:50:16)
Timed run ID:      32000    (06.05.2021 16:51:13)
Timed run ID:      64000    (06.05.2021 16:55:05)
Timed run ID:     128000    (06.05.2021 17:11:19)
Timed run ID:     256000    (06.05.2021 18:18:45)
[38]:
{'500': <TimeitResult : 2.01 ms ± 259 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 7.54 ms ± 1.68 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 27.5 ms ± 9.73 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 89.4 ms ± 24.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 359 ms ± 98.9 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 1.43 s ± 394 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 5.72 s ± 1.57 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 23.2 s ± 6.31 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 1min 37s ± 29.4 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 6min 44s ± 2min 26s per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[39]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[40]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name]))
    Run ID: Factor
=======================
       500: 1.00
      1000: 3.84
      2000: 11.38
      4000: 45.04
      8000: 180.10
     16000: 720.78
     32000: 2884.09
     64000: 11527.52
    128000: 46168.29
    256000: 184473.71

Run b

[41]:
run_name = "no_structure_run_b"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = pairwise_distances
setup_func = setup_commonnn_clustering__fit
[42]:
radius_cutoff = 0.1
cnn_cutoff = 0

RUN_ARGUMENTS_MAP[run_name] = []
for n_points in [500 * 2**x for x in range(8)]:  # Memory error on 128000
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "setup": (
                (), {"recipe": distance_recipe}
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[43]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_b
Timed run ID:        500    (06.05.2021 18:18:45)
Timed run ID:       1000    (06.05.2021 18:18:45)
Timed run ID:       2000    (06.05.2021 18:18:46)
Timed run ID:       4000    (06.05.2021 18:18:46)
Timed run ID:       8000    (06.05.2021 18:18:52)
Timed run ID:      16000    (06.05.2021 18:19:06)
Timed run ID:      32000    (06.05.2021 18:20:06)
Timed run ID:      64000    (06.05.2021 18:24:16)
[43]:
{'500': <TimeitResult : 724 µs ± 108 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 2.8 ms ± 629 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 12.7 ms ± 4.81 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 64.1 ms ± 23.3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 454 ms ± 196 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 1.17 s ± 239 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 4.25 s ± 33.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 17.1 s ± 64.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[44]:
RUN_TIMINGS_MAP[run_name]["128000"] = "MemoryError"
[45]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[46]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 3.60
      2000: 11.49
      4000: 47.19
      8000: 367.65
     16000: 1498.82
     32000: 6146.07
     64000: 24953.98

Run c

[47]:
run_name = "no_structure_run_c"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[48]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.1
cnn_cutoff = 0
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[49]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c
Timed run ID:        500    (06.05.2021 18:24:17)
Timed run ID:       1000    (06.05.2021 18:24:17)
Timed run ID:       2000    (06.05.2021 18:24:17)
Timed run ID:       4000    (06.05.2021 18:24:18)
Timed run ID:       8000    (06.05.2021 18:24:20)
Timed run ID:      16000    (06.05.2021 18:24:24)
Timed run ID:      32000    (06.05.2021 18:24:32)
Timed run ID:      64000    (06.05.2021 18:24:49)
Timed run ID:     128000    (06.05.2021 18:25:28)
Timed run ID:     256000    (06.05.2021 18:27:06)
[49]:
{'500': <TimeitResult : 83.8 µs ± 32 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 93.1 µs ± 19.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 185 µs ± 92.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 965 µs ± 577 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 7.92 ms ± 5.55 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 13.9 ms ± 9.04 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 40.7 ms ± 8.03 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 138 ms ± 21.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 530 ms ± 70.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 2.26 s ± 259 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[50]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[51]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 1.13
      2000: 1.45
      4000: 2.68
      8000: 26.48
     16000: 81.55
     32000: 409.02
     64000: 1472.91
    128000: 5848.36
    256000: 25515.87

Run c b

[52]:
run_name = "no_structure_run_c_b"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[53]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.1
cnn_cutoff = 100
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[54]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c_b
Timed run ID:        500    (06.05.2021 18:27:06)
Timed run ID:       1000    (06.05.2021 18:27:07)
Timed run ID:       2000    (06.05.2021 18:27:07)
Timed run ID:       4000    (06.05.2021 18:27:08)
Timed run ID:       8000    (06.05.2021 18:27:10)
Timed run ID:      16000    (06.05.2021 18:27:13)
Timed run ID:      32000    (06.05.2021 18:27:21)
Timed run ID:      64000    (06.05.2021 18:30:14)
Timed run ID:     128000    (06.05.2021 18:31:14)
Timed run ID:     256000    (06.05.2021 18:33:43)
[54]:
{'500': <TimeitResult : 73.3 µs ± 7.55 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 82.6 µs ± 2.25 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 106 µs ± 952 ns per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 182 µs ± 3.08 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 481 µs ± 103 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 1.54 ms ± 134 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 24.3 ms ± 26.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 15.7 s ± 2.49 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 2.54 s ± 409 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 7.16 s ± 329 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[55]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[56]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 1.14
      2000: 1.48
      4000: 2.56
      8000: 6.30
     16000: 21.03
     32000: 89.47
    128000: 29191.25
    256000: 94533.45
     64000: 168392.29

Run c c

[57]:
run_name = "no_structure_run_c_c"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[58]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.3
cnn_cutoff = 0
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[59]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c_c
Timed run ID:        500    (06.05.2021 18:33:43)
Timed run ID:       1000    (06.05.2021 18:33:44)
Timed run ID:       2000    (06.05.2021 18:33:44)
Timed run ID:       4000    (06.05.2021 18:33:45)
Timed run ID:       8000    (06.05.2021 18:33:47)
Timed run ID:      16000    (06.05.2021 18:33:52)
Timed run ID:      32000    (06.05.2021 18:34:05)
Timed run ID:      64000    (06.05.2021 18:34:46)
Timed run ID:     128000    (06.05.2021 18:37:36)
Timed run ID:     256000    (06.05.2021 18:54:39)
[59]:
{'500': <TimeitResult : 187 µs ± 76.8 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 1.09 ms ± 855 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 1.59 ms ± 722 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 4.27 ms ± 919 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 16.7 ms ± 2.68 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 72 ms ± 9.88 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 355 ms ± 42.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 1.84 s ± 189 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 10.8 s ± 892 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 1min 17s ± 4.69 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[60]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[61]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 3.46
      2000: 10.44
      4000: 36.08
      8000: 155.47
     16000: 704.80
     32000: 3539.87
     64000: 18977.51
    128000: 115620.64
    256000: 863166.43

Run d

[62]:
run_name = "no_structure_run_d"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[63]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.1
cnn_cutoff = 0
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {"sort": True}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_sorted_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[64]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d
Timed run ID:        500    (06.05.2021 18:54:40)
Timed run ID:       1000    (06.05.2021 18:54:40)
Timed run ID:       2000    (06.05.2021 18:54:41)
Timed run ID:       4000    (06.05.2021 18:54:42)
Timed run ID:       8000    (06.05.2021 18:54:44)
Timed run ID:      16000    (06.05.2021 18:54:48)
Timed run ID:      32000    (06.05.2021 18:54:55)
Timed run ID:      64000    (06.05.2021 18:55:13)
Timed run ID:     128000    (06.05.2021 18:55:51)
Timed run ID:     256000    (06.05.2021 18:57:25)
[64]:
{'500': <TimeitResult : 81.9 µs ± 25.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 94.6 µs ± 17.3 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 208 µs ± 126 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 774 µs ± 427 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 5.18 ms ± 3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 9.47 ms ± 3.72 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 30.8 ms ± 3.11 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 102 ms ± 6.33 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 365 ms ± 12.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 1.27 s ± 24.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[65]:
previous_timings = load_report(report_file)
for run_id, timing in previous_timings.items():
    if run_id in RUN_TIMINGS_MAP[run_name]:
        combined = combine_timeit_results(
            RUN_TIMINGS_MAP[run_name][run_id], timing
        )
    else:
        combined = timing

    RUN_TIMINGS_MAP[run_name][run_id] = combined
[66]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[67]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 1.14
      2000: 1.50
      4000: 2.67
      8000: 22.84
     16000: 80.77
     32000: 359.05
     64000: 1280.41
    128000: 4773.54
    256000: 17251.74

Run d b

[68]:
run_name = "no_structure_run_d_b"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[69]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.1
cnn_cutoff = 100
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {"sort": True}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_sorted_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[70]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d_b
Timed run ID:        500    (06.05.2021 18:57:25)
Timed run ID:       1000    (06.05.2021 18:57:25)
Timed run ID:       2000    (06.05.2021 18:57:26)
Timed run ID:       4000    (06.05.2021 18:57:27)
Timed run ID:       8000    (06.05.2021 18:57:28)
Timed run ID:      16000    (06.05.2021 18:57:32)
Timed run ID:      32000    (06.05.2021 18:57:40)
Timed run ID:      64000    (06.05.2021 18:58:22)
Timed run ID:     128000    (06.05.2021 18:59:04)
Timed run ID:     256000    (06.05.2021 19:00:43)
[70]:
{'500': <TimeitResult : 78.8 µs ± 9.26 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 88 µs ± 11 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 109 µs ± 4.72 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 185 µs ± 3.57 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 452 µs ± 9.96 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 1.5 ms ± 14.5 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 11.1 ms ± 6.44 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 2.65 s ± 443 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 625 ms ± 37.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 1.71 s ± 26.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[71]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[72]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 1.10
      2000: 1.43
      4000: 2.43
      8000: 5.96
     16000: 19.97
     32000: 87.27
    128000: 7739.12
    256000: 22501.85
     64000: 26586.38

Run d c

[73]:
run_name = "no_structure_run_d_c"
report_file = report_dir / f"{run_name}.json"

gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[74]:
RUN_ARGUMENTS_MAP[run_name] = []

radius_cutoff = 0.3
cnn_cutoff = 0
for n_points in n_points_list:
    RUN_ARGUMENTS_MAP[run_name].append(
        {
            "id": str(n_points),
            "gen": (
                ((n_points, 2),), {}
            ),
            "transform": (
                (radius_cutoff,), {"sort": True}
            ),
            "setup": (
                (), {
                    "preparation_hook": cluster.prepare_neighbourhoods,
                    "recipe": neighbours_sorted_recipe
                    }
            ),
            "timed": (
                (_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
            ),
        }
    )
[75]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
    gen_func,
    setup_func,
    RUN_ARGUMENTS_MAP[run_name],
    transform_func=transform_func,
    timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d_c
Timed run ID:        500    (06.05.2021 19:00:43)
Timed run ID:       1000    (06.05.2021 19:00:43)
Timed run ID:       2000    (06.05.2021 19:00:44)
Timed run ID:       4000    (06.05.2021 19:00:45)
Timed run ID:       8000    (06.05.2021 19:00:47)
Timed run ID:      16000    (06.05.2021 19:00:52)
Timed run ID:      32000    (06.05.2021 19:01:03)
Timed run ID:      64000    (06.05.2021 19:01:34)
Timed run ID:     128000    (06.05.2021 19:03:09)
Timed run ID:     256000    (06.05.2021 19:08:48)
[75]:
{'500': <TimeitResult : 163 µs ± 58.9 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '1000': <TimeitResult : 747 µs ± 484 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '2000': <TimeitResult : 1.08 ms ± 309 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '4000': <TimeitResult : 2.97 ms ± 398 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '8000': <TimeitResult : 10.1 ms ± 666 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '16000': <TimeitResult : 39.4 ms ± 1.34 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '32000': <TimeitResult : 150 ms ± 3.93 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '64000': <TimeitResult : 566 ms ± 6.94 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '128000': <TimeitResult : 2.2 s ± 10.9 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
 '256000': <TimeitResult : 8.99 s ± 26.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[76]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[77]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
    Run ID: Factor
=======================
       500: 1.00
      1000: 3.58
      2000: 8.80
      4000: 30.28
      8000: 113.11
     16000: 464.12
     32000: 1816.65
     64000: 6954.94
    128000: 27366.46
    256000: 112138.82

Plots

[30]:
figsrc_dir = pathlib.Path(os.path.expandvars("$WD/CommonNN/Manuscript/figsrc/Benchmark"))
[81]:
mpl.rcParams["font.size"] = 8
mpl.rcParams["axes.labelpad"] = 2
mpl.rcParams["xtick.major.pad"] = 2
mpl.rcParams["xtick.minor.pad"] = 1.9
mpl.rcParams["ytick.major.pad"] = 2
mpl.rcParams["ytick.minor.pad"] = 1.9

fig, (normal_ax, log_ax) = plt.subplots(2, 1, figsize=(3.33, 3.33/1.618))

run_name_list = [
    ('no_structure_run_a', 'points'),
    ('no_structure_run_b', 'distances'),
    ('no_structure_run_c', 'neighbours'),
    ('no_structure_run_d', 'sorted n.'),
]
markers = iter(["o", "v", "^", "s", "p", "P", "*", "h", "d"])

normal_ax.set_position([0.13, 0.51, 0.84, 0.40])
log_ax.set_position([0.13, 0.09, 0.84, 0.40])

# Inset
inset_ax = fig.add_axes(
    [0.125, 0.66, 0.39, 0.26],
    zorder=3,
    )

for name, label in run_name_list:
    marker = next(markers)

    plot_timings(
        RUN_TIMINGS_MAP[name],
        ax=normal_ax,
        id_to_x=lambda x: int(x),
        sort_ids=True,
        set_ax_props=False,
        plot_props={
            "label": label,
            "marker": marker,
            "markersize": 4
        }
    )

    line = normal_ax.lines[-1]
    log_line, = log_ax.plot(*line.get_xydata().T)
    saved_transform = log_line.get_transform()
    saved_clipbox = log_line.clipbox
    log_line.update_from(line)
    log_line.set_transform(saved_transform)
    log_line.clipbox = saved_clipbox

    line = normal_ax.lines[-1]
    inset_line, = inset_ax.plot(*line.get_xydata().T)
    saved_transform = inset_line.get_transform()
    saved_clipbox = inset_line.clipbox
    inset_line.update_from(line)
    inset_line.set_transform(saved_transform)
    inset_line.clipbox = saved_clipbox

    inset_line.set_markersize(3)

normal_ax.xaxis.tick_top()

normal_ax.set(**{
    "xlabel": None,
    "ylabel": None
})

log_ax.set(**{
    "xlabel": None,
    "ylabel": None,
    "xscale": "log",
    "yscale": "log",
})

inset_ax.set(**{
    "xlim": (400, 9000),
    "ylim": (-0.0005, 0.01),
    "xticks": (),
    "yticks": (),
})

mark_inset(normal_ax, inset_ax, loc1=3, loc2=4, edgecolor="k")

# Annotate memory error
annotate_memory_error(
    ax=normal_ax, line=normal_ax.get_lines()[1],
    memory_error_id=128000
)
annotate_memory_error(
    ax=log_ax, line=log_ax.get_lines()[1],
    memory_error_id=128000
)

commonax = fig.add_axes(
    [0.04, 0., 0.97, 1],
    zorder=-1,
    frame_on=False
    )

commonax.set(**{
    "xticks": (),
    "yticks": ()
})
commonax.set_ylabel("time / s", labelpad=0)

# Legend
legend = normal_ax.legend(
    fancybox=False,
    framealpha=1,
    edgecolor="k",
    fontsize="xx-small",
    loc=(0.80, 0.34)
    )
legend.get_frame().set_linewidth(0.5)

log_ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

# fig.subplots_adjust(left=0.11, bottom=0.1, right=0.86, top=0.89, hspace=0.1)
fig.savefig(figsrc_dir / "bm_cnnclustering_fit_no_structure_a_b_c_d.png")
../_images/benchmark_bm_92_0.png
[80]:
mpl.rcParams["font.size"] = 8
mpl.rcParams["axes.labelpad"] = 2
mpl.rcParams["xtick.major.pad"] = 2
mpl.rcParams["xtick.minor.pad"] = 1.9
mpl.rcParams["ytick.major.pad"] = 2
mpl.rcParams["ytick.minor.pad"] = 1.9

fig, (normal_ax, log_ax) = plt.subplots(2, 1, figsize=(3.33, 3.33/1.618))

run_name_list = [
    ('no_structure_run_c', '0.1, 0'),
    ('no_structure_run_c_b', '0.1, 100'),
    ('no_structure_run_c_c', '0.3, 0'),
    ('no_structure_run_d', '0.1, 0, sorted'),
    ('no_structure_run_d_b', '0.1, 100, sorted'),
    ('no_structure_run_d_c', '0.3, 0, sorted'),
]
markers = iter(["o", "v", "^", "s", "p", "P", "*", "h", "d"])

normal_ax.set_position([0.13, 0.51, 0.84, 0.40])
log_ax.set_position([0.13, 0.09, 0.84, 0.40])

# Inset
inset_ax = fig.add_axes(
    [0.125, 0.66, 0.39, 0.26],
    zorder=3,
    )

for name, label in run_name_list:
    marker = next(markers)

    plot_timings(
        RUN_TIMINGS_MAP[name],
        ax=normal_ax,
        id_to_x=lambda x: int(x),
        sort_ids=True,
        set_ax_props=False,
        plot_props={
            "label": label,
            "marker": marker,
            "markersize": 4
        }
    )

    line = normal_ax.lines[-1]
    log_line, = log_ax.plot(*line.get_xydata().T)
    saved_transform = log_line.get_transform()
    saved_clipbox = log_line.clipbox
    log_line.update_from(line)
    log_line.set_transform(saved_transform)
    log_line.clipbox = saved_clipbox

    line = normal_ax.lines[-1]
    inset_line, = inset_ax.plot(*line.get_xydata().T)
    saved_transform = inset_line.get_transform()
    saved_clipbox = inset_line.clipbox
    inset_line.update_from(line)
    inset_line.set_transform(saved_transform)
    inset_line.clipbox = saved_clipbox

    inset_line.set_markersize(3)

normal_ax.xaxis.tick_top()

normal_ax.set(**{
    "xlabel": None,
    "ylabel": None
})

log_ax.set(**{
    "xlabel": None,
    "ylabel": None,
    "xscale": "log",
    "yscale": "log",
})

inset_ax.set(**{
    "xlim": (400, 9000),
    "ylim": (-0.0005, 0.005),
    "xticks": (),
    "yticks": (),
})

mark_inset(normal_ax, inset_ax, loc1=3, loc2=4, edgecolor="k")

commonax = fig.add_axes(
    [0.04, 0., 0.97, 1],
    zorder=-1,
    frame_on=False
    )

commonax.set(**{
    "xticks": (),
    "yticks": ()
})
commonax.set_ylabel("time / s", labelpad=0)

# Legend
legend = normal_ax.legend(
    fancybox=False,
    framealpha=1,
    edgecolor="k",
    fontsize="xx-small",
    loc=(0.8, 0.33)
    )
legend.get_frame().set_linewidth(0.5)

log_ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

# fig.subplots_adjust(left=0.11, bottom=0.1, right=0.86, top=0.89, hspace=0.1)
fig.savefig(figsrc_dir / "bm_cnnclustering_fit_no_structure_c_cb_cc_d_db_dc.png")
../_images/benchmark_bm_93_0.png