Benchmark clustering performance¶
Remember to compile cnnclustering
with TRACE_CYTHON=0
when timings are to be measured. TRACE_CYTHON=1
is only required for line profiling and test coverage and causes substantial overhead.
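If a rebuild is needed, it could look like the following (a sketch only, assuming a standard setuptools/Cython build in which setup.py picks up the TRACE_CYTHON environment variable):

!TRACE_CYTHON=0 python setup.py build_ext --inplace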
Table of Contents
1 Function definitions
1.1 Benchmark helper
1.1.1 Collect
1.1.2 Initialisation
1.2 Timed functions
1.3 Data set generation functions
1.4 Data transformation
1.5 Plotting
2 Runs
2.1 Example
2.2 CommonNN Clustering
2.2.1 Run a
2.2.2 Run b
2.2.3 Run c
2.2.4 Run c b
2.2.5 Run c c
2.2.6 Run d
2.2.7 Run d b
2.2.8 Run d c
2.2.9 Plots
[82]:
from datetime import datetime
import json
from operator import itemgetter
import os
import pathlib
import sys
import time
import timeit
import cnnclustering
from cnnclustering import cluster
from cnnclustering import _fit, _primitive_types, _types
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset
import numpy as np
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import helper
[32]:
mpl.rcParams["figure.dpi"] = 300
[3]:
print(f"{'Python':>20} : ", *sys.version.splitlines())
Python : 3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27) [GCC 9.3.0]
[4]:
print(f"cnnclustering : ", cnnclustering.__version__)
cnnclustering : 0.3.11
Function definitions¶
Benchmark helper¶
Collect¶
[6]:
def collect_timings(
        gen_func, setup_func, run_arguments_list,
        transform_func=None, timings=None, repeats=10):
    """Orchestrate timings

    Args:
        gen_func: A function returning data. Called with
            run arguments "gen".
        setup_func: A function accepting data and returning the
            function which should be timed. Called with
            run arguments "setup".
        run_arguments_list: A list of run arguments.

    Keyword args:
        transform_func: A function transforming generated data before setup.
        timings: An optional timings mapping into which results should
            be put.
        repeats: Repeat the timing *n* times. Using timeit -n/-r directly
            would not ensure running the setup before each timing.

    Returns:
        timings mapping
    """
    # Timed function has to be in the global namespace to be discovered
    # by the %timeit magic
    global timed_args
    global timed_kwargs
    global timed_func

    if timings is None:
        timings = {}

    progress = tqdm(run_arguments_list, desc="Run completed")
    for run_index, arguments in enumerate(progress):
        gen_args, gen_kwargs = arguments.get("gen", ((), {}))
        data = gen_func(*gen_args, **gen_kwargs)

        if transform_func is not None:
            trans_args, trans_kwargs = arguments.get("transform", ((), {}))
            data = transform_func(data, *trans_args, **trans_kwargs)

        timeit_results = []
        for _ in range(repeats):
            try:
                setup_args, setup_kwargs = arguments.get("setup", ((), {}))
                timed_func = setup_func(data, *setup_args, **setup_kwargs)
                timed_args, timed_kwargs = arguments.get("timed", ((), {}))
                o = %timeit -n 1 -r 1 -q -o timed_func(*timed_args, **timed_kwargs)
            except MemoryError:
                o = "MemoryError"
                break
            finally:
                timeit_results.append(o)

        run_id = arguments.get("id", str(run_index))
        tqdm.write(f"Timed run ID: {run_id:>10} ({datetime.now().strftime('%d.%m.%Y %H:%M:%S')})")
        timings[run_id] = combine_timeit_results(*timeit_results)

    return timings
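combine_timeit_results (like save_report, load_report, get_ratios, print_ratios, and plot_timings used throughout this notebook) is defined in a helper cell not shown in this section. A minimal sketch of what the merge could look like, assuming IPython's TimeitResult interface and results obtained with -n 1 -r 1:

from IPython.core.magics.execution import TimeitResult

def combine_timeit_results(*results):
    # Sketch only: pool the single-loop timings of all results into one
    # TimeitResult, skipping eventual "MemoryError" placeholders
    all_runs = []
    for result in results:
        if isinstance(result, str):  # e.g. "MemoryError"
            continue
        all_runs.extend(result.all_runs)
    return TimeitResult(
        loops=1, repeat=len(all_runs),
        best=min(all_runs), worst=max(all_runs),
        all_runs=all_runs, compile_time=0.0, precision=3,
    )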
Initialisation¶
[12]:
def make_maps():
    """Initialise benchmark setting/result mappings if they do not exist yet"""
    global RUN_ARGUMENTS_MAP
    global RUN_TIMINGS_MAP

    # Check the global namespace; ``dir()`` would only list local names here
    if "RUN_ARGUMENTS_MAP" not in globals():
        RUN_ARGUMENTS_MAP = {}

    if "RUN_TIMINGS_MAP" not in globals():
        RUN_TIMINGS_MAP = {}
[13]:
def del_maps():
%xdel RUN_ARGUMENTS_MAP
%xdel RUN_TIMINGS_MAP
Timed functions¶
[14]:
def setup_sleep(data):
"""Dummy example"""
return time.sleep
[15]:
def setup_commonnn_clustering__fit(data, preparation_hook=cluster.prepare_pass, recipe=None):
"""Prepare benchmark of :meth:`cnnclustering.cluster.Clustering._fit`"""
if recipe is None:
recipe = {}
clustering = cluster.prepare_clustering(data, preparation_hook, **recipe)
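    # Pre-set zeroed labels so that the bare ``_fit`` step can be timed
    # without the bookkeeping done by ``fit``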
clustering._labels = _types.Labels(
np.zeros(clustering._input_data.n_points, order="C", dtype=_primitive_types.P_AINDEX)
)
return clustering._fit
[16]:
def setup_commonnn_clustering_fit(data, preparation_hook=cluster.prepare_pass, recipe=None):
"""Prepare benchmark of :meth:`cnnclustering.cluster.Clustering.fit`"""
if recipe is None:
recipe = {}
clustering = cluster.prepare_clustering(data, preparation_hook, **recipe)
return clustering.fit
Data set generation functions¶
[17]:
def gen_dummy():
return []
Data transformation¶
[19]:
def compute_neighbours(data, radius, sort=False):
tree = KDTree(data)
neighbourhoods = tree.query_radius(
data, r=radius, return_distance=False
)
if sort:
for n in neighbourhoods:
n.sort()
return neighbourhoods
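As a side note, KDTree.query_radius returns an object array of index arrays, where entry i holds the indices of all points within radius of point i (including i itself), in no particular order; hence the optional sorting. A quick usage check:

points = helper.gen_no_structure_points((100, 2))
neighbourhoods = compute_neighbours(points, radius=0.1, sort=True)
print(len(neighbourhoods), neighbourhoods[0][:5])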
Plotting¶
[21]:
def annotate_memory_error(
        ax, line, memory_error_id):
    """Extend a timings line with a dashed segment ending in an X marker
    to flag a run that failed with a MemoryError"""
last_sample_pos = line.get_xydata()[-1]
memory_error_pos = np.array([memory_error_id, last_sample_pos[-1]])
ax.plot(
*np.vstack([last_sample_pos, memory_error_pos]).T,
color="k",
linestyle="--",
zorder=0
)
ax.plot(
*memory_error_pos,
color=line.get_color(),
marker="X",
markeredgecolor="k",
zorder=0
)
Runs¶
Example¶
[25]:
example_run = [
{
"id": "1",
"gen": (
(), {}
),
"setup": (
(), {}
),
"timed": (
(0.1,), {
}
),
},
{
"id": "2",
"gen": (
(), {}
),
"setup": (
(), {}
),
"timed": (
(0.2,), {
}
),
}
]
sleep_timings = collect_timings(
gen_dummy,
setup_sleep,
example_run
)
print()
print_ratios(get_ratios(sleep_timings))
Timed run ID: 1 (06.05.2021 16:49:54)
Timed run ID: 2 (06.05.2021 16:49:56)
Run ID: Factor
=======================
1: 1.00
2: 2.00
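get_ratios and print_ratios come from a helper cell not shown here. Judging from the output, get_ratios relates a timing statistic of each run to the smallest one, and print_ratios prints the factors sorted in ascending order (which would also explain the itemgetter import above). A sketch under those assumptions:

def get_ratios(timings, which="average"):
    # Sketch: relate the chosen statistic (e.g. "average" or "best") of
    # each run to the fastest one; skip "MemoryError" placeholders
    values = {
        run_id: getattr(result, which)
        for run_id, result in timings.items()
        if not isinstance(result, str)
    }
    reference = min(values.values())
    return {run_id: value / reference for run_id, value in values.items()}

def print_ratios(ratios):
    # Sketch: tabulated output, sorted by increasing factor
    print(f"{'Run ID'}: {'Factor':>15}")
    print("=" * 23)
    for run_id, factor in sorted(ratios.items(), key=itemgetter(1)):
        print(f"{run_id:>6}: {factor:15.2f}")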
[26]:
sleep_timings
[26]:
{'1': <TimeitResult : 100 ms ± 2.42 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2': <TimeitResult : 200 ms ± 3.87 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[27]:
fig, ax = plt.subplots(figsize=(1, 1/1.618))
plot_timings(sleep_timings, ax=ax, id_to_x=lambda x: int(x))
[Figure: sleep example timings plotted against run ID]
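plot_timings is likewise defined in a cell not shown in this section. From its use in this notebook (the keyword arguments id_to_x, sort_ids, set_ax_props, and plot_props, and the fact that the last line added to the axes is reused in the plotting cells below), it presumably plots a timing statistic against the run IDs. A rough stand-in, not the actual helper:

def plot_timings(
        timings, ax=None, id_to_x=None,
        sort_ids=False, set_ax_props=True, plot_props=None):
    # Stand-in sketch; the actual helper may differ
    if ax is None:
        ax = plt.gca()
    if id_to_x is None:
        id_to_x = lambda run_id: run_id

    points = [
        (id_to_x(run_id), result.average)
        for run_id, result in timings.items()
        if not isinstance(result, str)
    ]
    if sort_ids:
        points.sort(key=itemgetter(0))

    ax.plot(*zip(*points), **(plot_props if plot_props is not None else {}))
    if set_ax_props:
        ax.set(xlabel="n points", ylabel="time / s")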
CommonNN Clustering¶
[22]:
report_dir = pathlib.Path("reports/qcm07/cnnclustering_fit")
Clustering of a 2D toy data set (no structure) with an increasing number of points, while dimensionality, cluster parameters, and cluster recipe are fixed:

Default recipe: \(d = 2\), \(r = 0.1\), \(c = 0\): run a

Distance recipe: \(d = 2\), \(r = 0.1\), \(c = 0\): run b

Neighbours recipe: \(d = 2\), \(r = 0.1\), \(c = 0\): run c

Sorted neighbours recipe: \(d = 2\), \(r = 0.1\), \(c = 0\): run d
[29]:
# The test data
fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
*helper.gen_no_structure_points((5000, 2)).T,
s=0.5,
)
[29]:
<matplotlib.collections.PathCollection at 0x7f26f1d37a00>
[Figure: scatter plot of the 2D no-structure test data (5000 points)]
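helper.gen_no_structure_points is provided by the accompanying helper module. The name follows sklearn's cluster-comparison example ("no structure"), so a uniform random sample is the likely behaviour. A hypothetical stand-in, not the actual helper:

def gen_no_structure_points(shape, random_state=None):
    # Stand-in sketch: uniformly distributed points in the unit square
    # (the actual helper may differ, e.g. in seeding or scaling)
    rng = np.random.default_rng(random_state)
    return rng.random(shape)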
[23]:
n_points_list = [500 * 2**x for x in range(10)]
[24]:
default_recipe = {
"input_data": _types.InputDataExtPointsMemoryview,
"neighbours_getter": _types.NeighboursGetterExtBruteForce,
"neighbours": (_types.NeighboursExtVector, (5000,), {}),
"neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
"metric": _types.MetricExtEuclideanReduced,
"similarity_checker": _types.SimilarityCheckerExtContains,
"queue": _types.QueueExtFIFOQueue,
"fitter": _fit.FitterExtBFS,
}
[25]:
distance_recipe = {
"input_data": _types.InputDataExtPointsMemoryview,
"neighbours_getter": _types.NeighboursGetterExtBruteForce,
"neighbours": (_types.NeighboursExtVector, (5000,), {}),
"neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
"metric": _types.MetricExtPrecomputed,
"similarity_checker": _types.SimilarityCheckerExtContains,
"queue": _types.QueueExtFIFOQueue,
"fitter": _fit.FitterExtBFS,
}
[26]:
neighbours_recipe = {
"input_data": _types.InputDataExtNeighboursMemoryview,
"neighbours_getter": _types.NeighboursGetterExtLookup,
"neighbours": (_types.NeighboursExtVector, (5000,), {}),
"neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
"metric": _types.MetricExtDummy,
"similarity_checker": _types.SimilarityCheckerExtContains,
"queue": _types.QueueExtFIFOQueue,
"fitter": _fit.FitterExtBFS,
}
[27]:
neighbours_sorted_recipe = {
"input_data": _types.InputDataExtNeighboursMemoryview,
"neighbours_getter": _types.NeighboursGetterExtLookup,
"neighbours": (_types.NeighboursExtVector, (5000,), {}),
"neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
"metric": _types.MetricExtDummy,
"similarity_checker": _types.SimilarityCheckerExtScreensorted,
"queue": _types.QueueExtFIFOQueue,
"fitter": _fit.FitterExtBFS,
}
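All four recipes follow the same pattern: a component is given either as a plain extension type (constructed with defaults) or as a (type, args, kwargs) tuple, here used to pre-allocate the neighbours containers with a vector size of 5000. A variant recipe can be derived by swapping single components, e.g. (hypothetical, assuming a full Euclidean metric type exists alongside the reduced one used above):

euclidean_recipe = dict(
    default_recipe,
    metric=_types.MetricExtEuclidean,  # assumption: full Euclidean metric
)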
[28]:
make_maps()
[29]:
run_name_list = [
'no_structure_run_a',
'no_structure_run_b',
'no_structure_run_c',
'no_structure_run_c_b',
'no_structure_run_c_c',
'no_structure_run_d',
'no_structure_run_d_b',
'no_structure_run_d_c',
]
for run_name in run_name_list:
report_file = report_dir / f"{run_name}.json"
RUN_TIMINGS_MAP[run_name] = load_report(report_file)
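save_report and load_report serialise timing results to and from the JSON report files (presumably what the json import above is for). A sketch of the pair, assuming the raw per-repeat timings are stored via TimeitResult.all_runs (reusing the TimeitResult import from the sketch above):

def save_report(timings, report_file, overwrite=False):
    # Sketch: store the raw per-repeat timings (or error strings)
    report_file = pathlib.Path(report_file)
    if report_file.exists() and not overwrite:
        raise FileExistsError(f"{report_file} exists")
    serialisable = {
        run_id: result if isinstance(result, str) else result.all_runs
        for run_id, result in timings.items()
    }
    with open(report_file, "w") as fp:
        json.dump(serialisable, fp)

def load_report(report_file):
    # Sketch: restore TimeitResult objects from the stored timings
    with open(report_file) as fp:
        raw = json.load(fp)
    return {
        run_id: runs if isinstance(runs, str) else TimeitResult(
            loops=1, repeat=len(runs), best=min(runs), worst=max(runs),
            all_runs=runs, compile_time=0.0, precision=3,
        )
        for run_id, runs in raw.items()
    }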
Run a¶
[36]:
run_name = "no_structure_run_a"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = None
setup_func = setup_commonnn_clustering__fit
[37]:
radius_cutoff = 0.1
cnn_cutoff = 0
RUN_ARGUMENTS_MAP[run_name] = []
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"setup": (
(), {"recipe": default_recipe}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[38]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_a
Timed run ID: 500 (06.05.2021 16:49:56)
Timed run ID: 1000 (06.05.2021 16:49:56)
Timed run ID: 2000 (06.05.2021 16:49:57)
Timed run ID: 4000 (06.05.2021 16:49:58)
Timed run ID: 8000 (06.05.2021 16:50:01)
Timed run ID: 16000 (06.05.2021 16:50:16)
Timed run ID: 32000 (06.05.2021 16:51:13)
Timed run ID: 64000 (06.05.2021 16:55:05)
Timed run ID: 128000 (06.05.2021 17:11:19)
Timed run ID: 256000 (06.05.2021 18:18:45)
[38]:
{'500': <TimeitResult : 2.01 ms ± 259 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 7.54 ms ± 1.68 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 27.5 ms ± 9.73 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 89.4 ms ± 24.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 359 ms ± 98.9 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 1.43 s ± 394 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 5.72 s ± 1.57 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 23.2 s ± 6.31 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 1min 37s ± 29.4 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 6min 44s ± 2min 26s per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[39]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[40]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name]))
Run ID: Factor
=======================
500: 1.00
1000: 3.84
2000: 11.38
4000: 45.04
8000: 180.10
16000: 720.78
32000: 2884.09
64000: 11527.52
128000: 46168.29
256000: 184473.71
Run b¶
[41]:
run_name = "no_structure_run_b"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = pairwise_distances
setup_func = setup_commonnn_clustering__fit
[42]:
radius_cutoff = 0.1
cnn_cutoff = 0
RUN_ARGUMENTS_MAP[run_name] = []
for n_points in [500 * 2**x for x in range(8)]: # Memory error on 128000
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"setup": (
(), {"recipe": distance_recipe}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[43]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_b
Timed run ID: 500 (06.05.2021 18:18:45)
Timed run ID: 1000 (06.05.2021 18:18:45)
Timed run ID: 2000 (06.05.2021 18:18:46)
Timed run ID: 4000 (06.05.2021 18:18:46)
Timed run ID: 8000 (06.05.2021 18:18:52)
Timed run ID: 16000 (06.05.2021 18:19:06)
Timed run ID: 32000 (06.05.2021 18:20:06)
Timed run ID: 64000 (06.05.2021 18:24:16)
[43]:
{'500': <TimeitResult : 724 µs ± 108 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 2.8 ms ± 629 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 12.7 ms ± 4.81 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 64.1 ms ± 23.3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 454 ms ± 196 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 1.17 s ± 239 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 4.25 s ± 33.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 17.1 s ± 64.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[44]:
RUN_TIMINGS_MAP[run_name]["128000"] = "MemoryError"
[45]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[46]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 3.60
2000: 11.49
4000: 47.19
8000: 367.65
16000: 1498.82
32000: 6146.07
64000: 24953.98
Run c¶
[47]:
run_name = "no_structure_run_c"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[48]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.1
cnn_cutoff = 0
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[49]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c
Timed run ID: 500 (06.05.2021 18:24:17)
Timed run ID: 1000 (06.05.2021 18:24:17)
Timed run ID: 2000 (06.05.2021 18:24:17)
Timed run ID: 4000 (06.05.2021 18:24:18)
Timed run ID: 8000 (06.05.2021 18:24:20)
Timed run ID: 16000 (06.05.2021 18:24:24)
Timed run ID: 32000 (06.05.2021 18:24:32)
Timed run ID: 64000 (06.05.2021 18:24:49)
Timed run ID: 128000 (06.05.2021 18:25:28)
Timed run ID: 256000 (06.05.2021 18:27:06)
[49]:
{'500': <TimeitResult : 83.8 µs ± 32 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 93.1 µs ± 19.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 185 µs ± 92.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 965 µs ± 577 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 7.92 ms ± 5.55 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 13.9 ms ± 9.04 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 40.7 ms ± 8.03 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 138 ms ± 21.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 530 ms ± 70.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 2.26 s ± 259 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[50]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[51]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 1.13
2000: 1.45
4000: 2.68
8000: 26.48
16000: 81.55
32000: 409.02
64000: 1472.91
128000: 5848.36
256000: 25515.87
Run c b¶
[52]:
run_name = "no_structure_run_c_b"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[53]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.1
cnn_cutoff = 100
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[54]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c_b
Timed run ID: 500 (06.05.2021 18:27:06)
Timed run ID: 1000 (06.05.2021 18:27:07)
Timed run ID: 2000 (06.05.2021 18:27:07)
Timed run ID: 4000 (06.05.2021 18:27:08)
Timed run ID: 8000 (06.05.2021 18:27:10)
Timed run ID: 16000 (06.05.2021 18:27:13)
Timed run ID: 32000 (06.05.2021 18:27:21)
Timed run ID: 64000 (06.05.2021 18:30:14)
Timed run ID: 128000 (06.05.2021 18:31:14)
Timed run ID: 256000 (06.05.2021 18:33:43)
[54]:
{'500': <TimeitResult : 73.3 µs ± 7.55 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 82.6 µs ± 2.25 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 106 µs ± 952 ns per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 182 µs ± 3.08 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 481 µs ± 103 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 1.54 ms ± 134 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 24.3 ms ± 26.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 15.7 s ± 2.49 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 2.54 s ± 409 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 7.16 s ± 329 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[55]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[56]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 1.14
2000: 1.48
4000: 2.56
8000: 6.30
16000: 21.03
32000: 89.47
128000: 29191.25
256000: 94533.45
64000: 168392.29
Run c c¶
[57]:
run_name = "no_structure_run_c_c"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[58]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.3
cnn_cutoff = 0
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[59]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_c_c
Timed run ID: 500 (06.05.2021 18:33:43)
Timed run ID: 1000 (06.05.2021 18:33:44)
Timed run ID: 2000 (06.05.2021 18:33:44)
Timed run ID: 4000 (06.05.2021 18:33:45)
Timed run ID: 8000 (06.05.2021 18:33:47)
Timed run ID: 16000 (06.05.2021 18:33:52)
Timed run ID: 32000 (06.05.2021 18:34:05)
Timed run ID: 64000 (06.05.2021 18:34:46)
Timed run ID: 128000 (06.05.2021 18:37:36)
Timed run ID: 256000 (06.05.2021 18:54:39)
[59]:
{'500': <TimeitResult : 187 µs ± 76.8 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 1.09 ms ± 855 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 1.59 ms ± 722 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 4.27 ms ± 919 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 16.7 ms ± 2.68 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 72 ms ± 9.88 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 355 ms ± 42.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 1.84 s ± 189 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 10.8 s ± 892 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 1min 17s ± 4.69 s per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[60]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[61]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 3.46
2000: 10.44
4000: 36.08
8000: 155.47
16000: 704.80
32000: 3539.87
64000: 18977.51
128000: 115620.64
256000: 863166.43
Run d¶
[62]:
run_name = "no_structure_run_d"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[63]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.1
cnn_cutoff = 0
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {"sort": True}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_sorted_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[64]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d
Timed run ID: 500 (06.05.2021 18:54:40)
Timed run ID: 1000 (06.05.2021 18:54:40)
Timed run ID: 2000 (06.05.2021 18:54:41)
Timed run ID: 4000 (06.05.2021 18:54:42)
Timed run ID: 8000 (06.05.2021 18:54:44)
Timed run ID: 16000 (06.05.2021 18:54:48)
Timed run ID: 32000 (06.05.2021 18:54:55)
Timed run ID: 64000 (06.05.2021 18:55:13)
Timed run ID: 128000 (06.05.2021 18:55:51)
Timed run ID: 256000 (06.05.2021 18:57:25)
[64]:
{'500': <TimeitResult : 81.9 µs ± 25.6 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 94.6 µs ± 17.3 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 208 µs ± 126 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 774 µs ± 427 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 5.18 ms ± 3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 9.47 ms ± 3.72 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 30.8 ms ± 3.11 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 102 ms ± 6.33 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 365 ms ± 12.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 1.27 s ± 24.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[65]:
previous_timings = load_report(report_file)
for run_id, timing in previous_timings.items():
if run_id in RUN_TIMINGS_MAP[run_name]:
combined = combine_timeit_results(
RUN_TIMINGS_MAP[run_name][run_id], timing
)
else:
combined = timing
RUN_TIMINGS_MAP[run_name][run_id] = combined
[66]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[67]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 1.14
2000: 1.50
4000: 2.67
8000: 22.84
16000: 80.77
32000: 359.05
64000: 1280.41
128000: 4773.54
256000: 17251.74
Run d b¶
[68]:
run_name = "no_structure_run_d_b"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[69]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.1
cnn_cutoff = 100
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {"sort": True}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_sorted_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[70]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d_b
Timed run ID: 500 (06.05.2021 18:57:25)
Timed run ID: 1000 (06.05.2021 18:57:25)
Timed run ID: 2000 (06.05.2021 18:57:26)
Timed run ID: 4000 (06.05.2021 18:57:27)
Timed run ID: 8000 (06.05.2021 18:57:28)
Timed run ID: 16000 (06.05.2021 18:57:32)
Timed run ID: 32000 (06.05.2021 18:57:40)
Timed run ID: 64000 (06.05.2021 18:58:22)
Timed run ID: 128000 (06.05.2021 18:59:04)
Timed run ID: 256000 (06.05.2021 19:00:43)
[70]:
{'500': <TimeitResult : 78.8 µs ± 9.26 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 88 µs ± 11 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 109 µs ± 4.72 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 185 µs ± 3.57 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 452 µs ± 9.96 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 1.5 ms ± 14.5 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 11.1 ms ± 6.44 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 2.65 s ± 443 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 625 ms ± 37.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 1.71 s ± 26.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[71]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[72]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 1.10
2000: 1.43
4000: 2.43
8000: 5.96
16000: 19.97
32000: 87.27
128000: 7739.12
256000: 22501.85
64000: 26586.38
Run d c¶
[73]:
run_name = "no_structure_run_d_c"
report_file = report_dir / f"{run_name}.json"
gen_func = helper.gen_no_structure_points
transform_func = compute_neighbours
setup_func = setup_commonnn_clustering__fit
[74]:
RUN_ARGUMENTS_MAP[run_name] = []
radius_cutoff = 0.3
cnn_cutoff = 0
for n_points in n_points_list:
RUN_ARGUMENTS_MAP[run_name].append(
{
"id": str(n_points),
"gen": (
((n_points, 2),), {}
),
"transform": (
(radius_cutoff,), {"sort": True}
),
"setup": (
(), {
"preparation_hook": cluster.prepare_neighbourhoods,
"recipe": neighbours_sorted_recipe
}
),
"timed": (
(_types.ClusterParameters(radius_cutoff, cnn_cutoff),), {}
),
}
)
[75]:
print(f"Collection for run: {run_name}")
RUN_TIMINGS_MAP[run_name] = {}
collect_timings(
gen_func,
setup_func,
RUN_ARGUMENTS_MAP[run_name],
transform_func=transform_func,
timings=RUN_TIMINGS_MAP[run_name]
)
Collection for run: no_structure_run_d_c
Timed run ID: 500 (06.05.2021 19:00:43)
Timed run ID: 1000 (06.05.2021 19:00:43)
Timed run ID: 2000 (06.05.2021 19:00:44)
Timed run ID: 4000 (06.05.2021 19:00:45)
Timed run ID: 8000 (06.05.2021 19:00:47)
Timed run ID: 16000 (06.05.2021 19:00:52)
Timed run ID: 32000 (06.05.2021 19:01:03)
Timed run ID: 64000 (06.05.2021 19:01:34)
Timed run ID: 128000 (06.05.2021 19:03:09)
Timed run ID: 256000 (06.05.2021 19:08:48)
[75]:
{'500': <TimeitResult : 163 µs ± 58.9 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'1000': <TimeitResult : 747 µs ± 484 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'2000': <TimeitResult : 1.08 ms ± 309 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'4000': <TimeitResult : 2.97 ms ± 398 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'8000': <TimeitResult : 10.1 ms ± 666 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'16000': <TimeitResult : 39.4 ms ± 1.34 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'32000': <TimeitResult : 150 ms ± 3.93 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'64000': <TimeitResult : 566 ms ± 6.94 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'128000': <TimeitResult : 2.2 s ± 10.9 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>,
'256000': <TimeitResult : 8.99 s ± 26.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)>}
[76]:
save_report(RUN_TIMINGS_MAP[run_name], report_file, overwrite=True)
[77]:
print_ratios(get_ratios(RUN_TIMINGS_MAP[run_name], which="best"))
Run ID: Factor
=======================
500: 1.00
1000: 3.58
2000: 8.80
4000: 30.28
8000: 113.11
16000: 464.12
32000: 1816.65
64000: 6954.94
128000: 27366.46
256000: 112138.82
Plots¶
[30]:
figsrc_dir = pathlib.Path(os.path.expandvars("$WD/CommonNN/Manuscript/figsrc/Benchmark"))
[81]:
mpl.rcParams["font.size"] = 8
mpl.rcParams["axes.labelpad"] = 2
mpl.rcParams["xtick.major.pad"] = 2
mpl.rcParams["xtick.minor.pad"] = 1.9
mpl.rcParams["ytick.major.pad"] = 2
mpl.rcParams["ytick.minor.pad"] = 1.9
fig, (normal_ax, log_ax) = plt.subplots(2, 1, figsize=(3.33, 3.33/1.618))
run_name_list = [
('no_structure_run_a', 'points'),
('no_structure_run_b', 'distances'),
('no_structure_run_c', 'neighbours'),
('no_structure_run_d', 'sorted n.'),
]
markers = iter(["o", "v", "^", "s", "p", "P", "*", "h", "d"])
normal_ax.set_position([0.13, 0.51, 0.84, 0.40])
log_ax.set_position([0.13, 0.09, 0.84, 0.40])
# Inset
inset_ax = fig.add_axes(
[0.125, 0.66, 0.39, 0.26],
zorder=3,
)
for name, label in run_name_list:
marker = next(markers)
plot_timings(
RUN_TIMINGS_MAP[name],
ax=normal_ax,
id_to_x=lambda x: int(x),
sort_ids=True,
set_ax_props=False,
plot_props={
"label": label,
"marker": marker,
"markersize": 4
}
)
line = normal_ax.lines[-1]
log_line, = log_ax.plot(*line.get_xydata().T)
saved_transform = log_line.get_transform()
saved_clipbox = log_line.clipbox
log_line.update_from(line)
log_line.set_transform(saved_transform)
log_line.clipbox = saved_clipbox
line = normal_ax.lines[-1]
inset_line, = inset_ax.plot(*line.get_xydata().T)
saved_transform = inset_line.get_transform()
saved_clipbox = inset_line.clipbox
inset_line.update_from(line)
inset_line.set_transform(saved_transform)
inset_line.clipbox = saved_clipbox
inset_line.set_markersize(3)
normal_ax.xaxis.tick_top()
normal_ax.set(**{
"xlabel": None,
"xlabel": None
})
log_ax.set(**{
"xlabel": None,
"xlabel": None,
"xscale": "log",
"yscale": "log",
})
inset_ax.set(**{
"xlim": (400, 9000),
"ylim": (-0.0005, 0.01),
"xticks": (),
"yticks": (),
})
mark_inset(normal_ax, inset_ax, loc1=3, loc2=4, edgecolor="k")
# Annotate memory error
annotate_memory_error(
ax=normal_ax, line=normal_ax.get_lines()[1],
memory_error_id=128000
)
annotate_memory_error(
ax=log_ax, line=log_ax.get_lines()[1],
memory_error_id=128000
)
commonax = fig.add_axes(
[0.04, 0., 0.97, 1],
zorder=-1,
frame_on=False
)
commonax.set(**{
"xticks": (),
"yticks": ()
})
commonax.set_ylabel("time / s", labelpad=0)
# Legend
legend = normal_ax.legend(
fancybox=False,
framealpha=1,
edgecolor="k",
fontsize="xx-small",
loc=(0.80, 0.34)
)
legend.get_frame().set_linewidth(0.5)
log_ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
# fig.subplots_adjust(left=0.11, bottom=0.1, right=0.86, top=0.89, hspace=0.1)
fig.savefig(figsrc_dir / "bm_cnnclustering_fit_no_structure_a_b_c_d.png")
[Figure: timings for runs a to d vs. number of points, linear scale (top, with inset) and log-log scale (bottom); the memory error of run b is marked]
[80]:
mpl.rcParams["font.size"] = 8
mpl.rcParams["axes.labelpad"] = 2
mpl.rcParams["xtick.major.pad"] = 2
mpl.rcParams["xtick.minor.pad"] = 1.9
mpl.rcParams["ytick.major.pad"] = 2
mpl.rcParams["ytick.minor.pad"] = 1.9
fig, (normal_ax, log_ax) = plt.subplots(2, 1, figsize=(3.33, 3.33/1.618))
run_name_list = [
('no_structure_run_c', '0.1, 0'),
('no_structure_run_c_b', '0.1, 100'),
('no_structure_run_c_c', '0.3, 0'),
('no_structure_run_d', '0.1, 0, sorted'),
('no_structure_run_d_b', '0.1, 100, sorted'),
('no_structure_run_d_c', '0.3, 0, sorted'),
]
markers = iter(["o", "v", "^", "s", "p", "P", "*", "h", "d"])
normal_ax.set_position([0.13, 0.51, 0.84, 0.40])
log_ax.set_position([0.13, 0.09, 0.84, 0.40])
# Inset
inset_ax = fig.add_axes(
[0.125, 0.66, 0.39, 0.26],
zorder=3,
)
for name, label in run_name_list:
marker = next(markers)
plot_timings(
RUN_TIMINGS_MAP[name],
ax=normal_ax,
id_to_x=lambda x: int(x),
sort_ids=True,
set_ax_props=False,
plot_props={
"label": label,
"marker": marker,
"markersize": 4
}
)
line = normal_ax.lines[-1]
log_line, = log_ax.plot(*line.get_xydata().T)
saved_transform = log_line.get_transform()
saved_clipbox = log_line.clipbox
log_line.update_from(line)
log_line.set_transform(saved_transform)
log_line.clipbox = saved_clipbox
line = normal_ax.lines[-1]
inset_line, = inset_ax.plot(*line.get_xydata().T)
saved_transform = inset_line.get_transform()
saved_clipbox = inset_line.clipbox
inset_line.update_from(line)
inset_line.set_transform(saved_transform)
inset_line.clipbox = saved_clipbox
inset_line.set_markersize(3)
normal_ax.xaxis.tick_top()
normal_ax.set(**{
"xlabel": None,
"xlabel": None
})
log_ax.set(**{
"xlabel": None,
"xlabel": None,
"xscale": "log",
"yscale": "log",
})
inset_ax.set(**{
"xlim": (400, 9000),
"ylim": (-0.0005, 0.005),
"xticks": (),
"yticks": (),
})
mark_inset(normal_ax, inset_ax, loc1=3, loc2=4, edgecolor="k")
commonax = fig.add_axes(
[0.04, 0., 0.97, 1],
zorder=-1,
frame_on=False
)
commonax.set(**{
"xticks": (),
"yticks": ()
})
commonax.set_ylabel("time / s", labelpad=0)
# Legend
legend = normal_ax.legend(
fancybox=False,
framealpha=1,
edgecolor="k",
fontsize="xx-small",
loc=(0.8, 0.33)
)
legend.get_frame().set_linewidth(0.5)
log_ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
# fig.subplots_adjust(left=0.11, bottom=0.1, right=0.86, top=0.89, hspace=0.1)
fig.savefig(figsrc_dir / "bm_cnnclustering_fit_no_structure_c_cb_cc_d_db_dc.png")
[Figure: timings for the neighbours-based runs c, c b, c c, d, d b, and d c vs. number of points, linear scale (top, with inset) and log-log scale (bottom)]