Table of Contents

  • 1  Helper function definitions

    • 1.1  Plotting

    • 1.2  Organisation

  • 2  No structure

    • 2.1  Checks

    • 2.2  Scan

    • 2.3  Plots

      • 2.3.1  Time

        • 2.3.1.1  Trends

      • 2.3.2  Noise level

      • 2.3.3  Largest cluster

      • 2.3.4  Number of clusters

  • 3  Mono blob

    • 3.1  Checks

  • 4  Di blob

    • 4.1  Checks

  • 5  Quadro blob

    • 5.1  Checks

  • 6  Blobs

    • 6.1  Checks

    • 6.2  Scan

    • 6.3  Plots

      • 6.3.1  Time

      • 6.3.2  Noise level

      • 6.3.3  Largest cluster

      • 6.3.4  Number of clusters

  • 7  Moons

    • 7.1  Checks

    • 7.2  Scan

    • 7.3  Plots

      • 7.3.1  Time

      • 7.3.2  Noise level

      • 7.3.3  Largest cluster

      • 7.3.4  Number of clusters

  • 8  Mono circle

    • 8.1  Checks

  • 9  Circles

    • 9.1  Checks

    • 9.2  Scan

    • 9.3  Plots

      • 9.3.1  Time

      • 9.3.2  Noise level

      • 9.3.3  Largest cluster

      • 9.3.4  Number of clusters

  • 10  Varied

    • 10.1  Checks

    • 10.2  Scan

    • 10.3  Plots

      • 10.3.1  Time

      • 10.3.2  Noise level

      • 10.3.3  Largest cluster

      • 10.3.4  Number of clusters

      • 10.3.5  Evaluation

  • 11  Aniso

    • 11.1  Checks

    • 11.2  Scan

    • 11.3  Plots

      • 11.3.1  Time

      • 11.3.2  Noise level

      • 11.3.3  Largest cluster

      • 11.3.4  Number of clusters

      • 11.3.5  Evaluation

  • 12  Backbone dihedrals

    • 12.1  Checks

    • 12.2  Scan

    • 12.3  Plots

      • 12.3.1  Time

      • 12.3.2  Noise level

      • 12.3.3  Largest cluster

      • 12.3.4  Number of clusters

      • 12.3.5  Evaluation

        • 12.3.5.1  Hierarchical manual

  • 13  Peptide (TICA)

    • 13.1  Checks

    • 13.2  Scan

    • 13.3  Plots

      • 13.3.1  Time

      • 13.3.2  Noise level

      • 13.3.3  Largest cluster

      • 13.3.4  Number of clusters

      • 13.3.5  Evaluation

      • 13.3.6  Hierarchical semi-automatic

  • 14  Langerin (PCA)

    • 14.1  Checks

Cluster parameter scans

Remember to compile cnnclustering with TRACE_CYTHON=0 if timings should be measured.

[1]:
from collections import defaultdict
import itertools
import json
import os
import pathlib
import sys

import cnnclustering
from cnnclustering import cluster, plot
from cnnclustering import _fit, _primitive_types, _types
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

import helper
[2]:
import importlib
importlib.reload(helper)
[2]:
<module 'helper' from '/home/janjoswig/repo/CommonNNClustering/docsrc/benchmark/helper.py'>
[229]:
# Global figure defaults for this notebook: small, high-resolution panels
# with golden-ratio aspect (height = width / 1.618).
mpl.rcParams.update({
    "figure.dpi": 300,
    "font.size": 6,
    "figure.figsize": (2, 2/1.618),
    "axes.titlepad": 1,
})
[4]:
print(f"{'Python':>20} :  ", *sys.version.splitlines())
              Python :   3.8.8 (default, Mar 11 2021, 08:58:19)  [GCC 8.3.0]
[5]:
print(f"cnnclustering :  ", cnnclustering.__version__)
cnnclustering :   0.4.2

Helper function definitions

[6]:
# Component recipe that is unpacked into `cluster.prepare_clustering(...)` by
# the scan loops of this notebook, where the input is the result of
# `helper.compute_neighbours(data, r, sort=True)`.
# Entries are either a type or a `(type, args, kwargs)` tuple.
# NOTE(review): the positional argument 5000 of `NeighboursExtVector` is
# presumably an initial buffer size -- confirm against cnnclustering._types.
neighbours_sorted_recipe = {
    "input_data": _types.InputDataExtNeighboursMemoryview,
    "neighbours_getter": _types.NeighboursGetterExtLookup,
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtDummy,
    "similarity_checker": _types.SimilarityCheckerExtScreensorted,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterExtBFS,
}
[7]:
# Alternative component recipe. Compared to `neighbours_sorted_recipe` it
# swaps the input data type (`InputDataPointsSklearnKDTree`), the neighbours
# getter (`NeighboursGetterRecomputeLookup` with `is_sorted=True`) and the
# fitter (`FitterBFS` instead of `FitterExtBFS`).
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- confirm whether it is still needed.
neighbours_sorted_alternative_recipe = {
    "input_data": _types.InputDataPointsSklearnKDTree,
    "neighbours_getter": (_types.NeighboursGetterRecomputeLookup, (), {"is_sorted": True}),
    "neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "neighbour_neighbours": (_types.NeighboursExtVector, (5000,), {}),
    "metric": _types.MetricExtDummy,
    "similarity_checker": _types.SimilarityCheckerExtScreensorted,
    "queue": _types.QueueExtFIFOQueue,
    "fitter": _fit.FitterBFS,
}
[8]:
# Create the registry of clusterings (scan name -> clustering) only once, so
# that re-running this cell does not discard results that already exist.
try:
    CLUSTERING_MAP
except NameError:
    CLUSTERING_MAP = {}
[377]:
CLUSTERING_MAP
[377]:
{'backbone': Clustering(input_data=<cnnclustering._types.InputDataExtPointsMemoryview object at 0x7fbbfe08a150>, neighbours_getter=<cnnclustering._types.NeighboursGetterExtBruteForce object at 0x7fbbcb6e9750>, neighbours=<cnnclustering._types.NeighboursExtVector object at 0x7fbbcaeb72f0>, neighbour_neighbours=<cnnclustering._types.NeighboursExtVector object at 0x7fbbcaeb7430>, metric=<cnnclustering._types.MetricExtEuclidean object at 0x7fbbcb6e9e70>, similarity_checker=<cnnclustering._types.SimilarityCheckerExtContains object at 0x7fbbcb728ad0>, queue=<cnnclustering._types.QueueExtFIFOQueue object at 0x7fbbcafc96c0>, fitter=<cnnclustering._fit.FitterExtBFS object at 0x7fbbcb728a30>, predictor=None)}
[9]:
# Dump the summary records of every clustering in CLUSTERING_MAP to JSON.
# Set `overwrite = True` to replace an existing record file.
overwrite = False

record_file = pathlib.Path("records/records.json")
if record_file.is_file() and not overwrite:
    # Interpolate the path (the braces were missing, so the literal text
    # "str(record_file)" was reported instead of the file name)
    raise RuntimeError(f"File exists: {record_file}")

with open(record_file, "w") as fp:
    json.dump({k: v.summary._list for k, v in CLUSTERING_MAP.items()}, fp, cls=helper.RecordEncoder, indent=4)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-9-2d943096d177> in <module>
      2 record_file = pathlib.Path("records/records.json")
      3 if record_file.is_file() and not overwrite:
----> 4     raise RuntimeError(f"File exists: str(record_file)")
      5
      6 with open(record_file, "w") as fp:

RuntimeError: File exists: str(record_file)
[376]:
with open("records/records.json", "r") as fp:
    RECORD_MAP = json.load(fp, object_hook=helper.as_Record)
[217]:
# Dump RECORD_MAP (scan name -> list of records) to JSON.
# Set `overwrite = True` to replace an existing record file.
overwrite = False
record_file = pathlib.Path("records/records.json")
if record_file.is_file() and not overwrite:
    # Interpolate the path (the braces were missing, so the literal text
    # "str(record_file)" was reported instead of the file name)
    raise RuntimeError(f"File exists: {record_file}")

with open(record_file, "w") as fp:
    json.dump(RECORD_MAP, fp, cls=helper.RecordEncoder, indent=4)
[396]:
for scan_name, cl in CLUSTERING_MAP.items():
    print(scan_name, cl.summary[-1])
backbone -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
10020     1.000     1999      20        None      2         0.642     0.149     00:00:0.837
-----------------------------------------------------------------------------------------------

[216]:
for scan_name, recs in RECORD_MAP.items():
    print(scan_name, recs[-1])
backbone -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
10020     1.000     250       20        None      2         0.962     0.005     00:00:0.233
-----------------------------------------------------------------------------------------------

no_structure -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
5000      1.000     150       20        None      1         1.000     0.000     00:00:0.039
-----------------------------------------------------------------------------------------------

blobs -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
5000      0.600     150       20        None      2         0.667     0.000     00:00:0.055
-----------------------------------------------------------------------------------------------

moons -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
5000      0.600     150       20        None      2         0.500     0.000     00:00:0.062
-----------------------------------------------------------------------------------------------

varied -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
5000      0.500     150       20        None      1         0.923     0.077     00:00:0.077
-----------------------------------------------------------------------------------------------

aniso -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
5000      0.600     150       20        None      1         0.999     0.001     00:00:0.059
-----------------------------------------------------------------------------------------------

tica -----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
37500     1.000     250       20        None      4         0.828     0.005     00:00:1.761
-----------------------------------------------------------------------------------------------

Plotting

[10]:
def calc_histogram(x, hist_props=None, save=False, base_name=None):
    """Compute a histogram of `x` together with the midpoints of its bins.

    Args:
        x: Array-like of values, passed on to :func:`numpy.histogram`.
        hist_props: Optional dict of keyword arguments for
            :func:`numpy.histogram`.  Entries override the defaults
            ``bins=100`` and ``density=True``.  The dict is not modified.
        save: If ``True``, write the results to
            ``{base_name}_hist.npy`` and ``{base_name}_binmids.npy``.
        base_name: Path prefix of the output files.  Required if `save`
            is ``True``.

    Returns:
        Tuple ``(histogram, binmids)`` of two arrays with one entry per bin.

    Raises:
        ValueError: If `save` is ``True`` and `base_name` is ``None``.
    """
    # Fail early, before any work is done, and name the actual parameter
    if save and base_name is None:
        raise ValueError("If 'save=True', need to provide 'base_name'")

    histogram_kwargs = {"bins": 100, "density": True}
    if hist_props is not None:
        histogram_kwargs.update(hist_props)

    histogram, bins = np.histogram(x, **histogram_kwargs)
    # Bin edges -> bin centres (one value per bin)
    binmids = 0.5 * (bins[:-1] + bins[1:])

    if save:
        np.save(f"{base_name}_hist.npy", histogram)
        np.save(f"{base_name}_binmids.npy", binmids)

    return histogram, binmids
[373]:
def calc_n_neighbours_per_radius(
        distances,
        points_factor=1,
        highlights=None,
        save=False,
        base_name=None,
        hist_props=None):
    """Count neighbours per point as a function of the search radius.

    The radius is scanned from 0 to the maximum distance rounded up to
    the next integer.  For every radius ``r`` the number of entries with
    ``distance < r`` is counted per column of `distances`, the point
    itself is subtracted, and minimum, maximum and mean over all points
    are recorded.

    Args:
        distances: Pairwise distance matrix (array-like).  Assumed to be
            square with zero self-distances, as the point itself is
            subtracted from every count.
        points_factor: Factor multiplied onto the neighbour counts.
        highlights: Optional iterable of radii that are evaluated in
            addition to the regular grid.  For these radii a histogram
            of the neighbour counts is computed with `calc_histogram`.
        save: If ``True``, write all results to ``{base_name}_*.npy``.
        base_name: Path prefix of the output files.  Required if `save`
            is ``True``.
        hist_props: Optional dict of keyword arguments for the highlight
            histograms.  ``"range"`` is always set to the (min, max)
            neighbour count at the respective radius.  The dict passed
            in is not modified.

    Returns:
        Tuple ``(r_array, min_n, max_n, mean_n, highlights_distributions)``
        where the first four are arrays of equal length and the last maps
        each highlighted radius to ``(histogram, binmids)``.

    Raises:
        ValueError: If `save` is ``True`` and `base_name` is ``None``.
    """
    # Fail early, before the (potentially expensive) scan
    if save and base_name is None:
        raise ValueError("If 'save=True', need to provide 'base_name'")

    # Work on a copy: "range" is set below and must not leak to the caller
    hist_props = {} if hist_props is None else dict(hist_props)

    distances = np.asarray(distances)

    upper_dist_bound = np.ceil(distances.max())
    # Ten grid intervals per unit of the leading digit of the upper bound
    r_array = np.linspace(
        0,
        upper_dist_bound,
        int(f"{upper_dist_bound:1.0e}".split("e")[0]) * 10 + 1
    )

    # Sorted, de-duplicated highlights; `None` means "no highlights"
    if highlights is None:
        highlights = np.array([], dtype=float)
    else:
        highlights = np.unique(np.asarray(highlights, dtype=float))

    # Only insert radii that are not on the grid already, so that no
    # radius is evaluated twice
    new_radii = highlights[~np.isin(highlights, r_array)]
    if new_radii.size > 0:
        r_array = np.insert(
            r_array, np.searchsorted(r_array, new_radii), new_radii
        )
    highlight_set = set(highlights.tolist())

    min_n = []
    max_n = []
    mean_n = []
    highlights_distributions = {}
    for r in r_array:
        # Subtract the point itself.  At r == 0 nothing (not even the
        # point itself) fulfils the strict inequality, hence the clipping.
        n_neighbours = np.maximum(
            np.count_nonzero(distances < r, axis=0) - 1, 0
        )
        # Not in-place: an integer array cannot take a float factor
        n_neighbours = n_neighbours * points_factor
        min_n.append(n_neighbours.min())
        max_n.append(n_neighbours.max())
        mean_n.append(n_neighbours.mean())
        if float(r) in highlight_set:
            hist_props["range"] = (min_n[-1], max_n[-1])
            h, binmids = calc_histogram(n_neighbours, hist_props=hist_props, save=False)
            highlights_distributions[r] = (h, binmids)

    min_n = np.asarray(min_n)
    max_n = np.asarray(max_n)
    mean_n = np.asarray(mean_n)

    if save:
        np.save(f"{base_name}_r_array.npy", r_array)
        np.save(f"{base_name}_min_n.npy", min_n)
        np.save(f"{base_name}_max_n.npy", max_n)
        np.save(f"{base_name}_mean_n.npy", mean_n)
        # The dict is stored as a pickled object array
        np.save(f"{base_name}_highlights_dist.npy", highlights_distributions)

    return r_array, min_n, max_n, mean_n, highlights_distributions
[12]:
def plot_n_neighbours_per_radius(r_array, min_n, max_n, mean_n, dist_hist_cutoff, n_points):
    """Plot min/max/mean neighbour counts against the radius in two panels.

    The left panel shows the full radius range with a dashed vertical
    line at `dist_hist_cutoff` and dashed horizontal lines at the level
    of each curve near the cutoff.  The right panel zooms in on radii
    from 0 up to the cutoff.

    Args:
        r_array: Radii (x-values).
        min_n, max_n, mean_n: Neighbour counts per radius (y-values).
        dist_hist_cutoff: Radius to mark.  Must lie within the range of
            `r_array` (greater than the first and not greater than the
            last entry), otherwise the index lookup raises an IndexError.
        n_points: Upper major y-tick of the left panel.
    """
    base_width, base_height = mpl.rcParams["figure.figsize"]
    fig, (full_ax, zoom_ax) = plt.subplots(
        1, 2,
        figsize=(base_width * 1.5, base_height)
    )

    curves = (min_n, max_n, mean_n)
    for ax in (full_ax, zoom_ax):
        for curve in curves:
            ax.plot(r_array, curve)

    dashed = {"color": "k", "linestyle": "--", "linewidth": 0.5}
    full_ax.axvline(dist_hist_cutoff, **dashed)

    # Last grid index below the cutoff; the curve level at the cutoff is
    # approximated by the mean of the two grid values around it
    below_cutoff = np.nonzero(r_array < dist_hist_cutoff)[0][-1]
    intersection_ticks = []
    for curve in curves:
        level = (curve[below_cutoff] + curve[below_cutoff + 1]) / 2
        intersection_ticks.append(level)
        full_ax.hlines(
            y=level,
            xmin=0, xmax=dist_hist_cutoff,
            transform=full_ax.transData,
            zorder=3,
            **dashed
        )

    full_ax.set_yticks((0, n_points))
    full_ax.set_yticks(intersection_ticks, minor=True)
    full_ax.set_yticklabels(
        [int(tick) for tick in intersection_ticks],
        minor=True,
        fontsize="x-small"
    )
    full_ax.set_xlim(0, r_array[-1])
    full_ax.set_xlabel("$r$")
    full_ax.set_ylabel("# neighbours")

    zoom_ax.set_xlim(0, dist_hist_cutoff)
    # The y-range of the zoom follows the level of the mean curve
    zoom_ax.set_ylim(0, np.ceil(intersection_ticks[2]))
    zoom_ax.set_xlabel("$r$")
    zoom_ax.legend(["min", "max", "mean"])
    plt.tight_layout(pad=0.1, w_pad=1)

Organisation

Ignore/modify these if needed.

[13]:
# Find data sets in ...
data_dir = pathlib.Path(os.path.expandvars("$WD/CommonNN"))
[14]:
# Save observations to ...
figsrc_dir = pathlib.Path(data_dir / "Manuscript/figsrc/Scans")

No structure

[75]:
scan_name = "no_structure"
[116]:
data = helper.gen_no_structure_points((5000, 2))
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[116]:
<matplotlib.collections.PathCollection at 0x7fc668d36220>
../_images/benchmark_scans_28_1.png

Checks

[78]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_30_0.png
[110]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}_dist")
[82]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.00027307491487952
Max:  4.842668307314774
[118]:
# The helper returns the per-highlight distributions as a fifth element;
# the starred target discards any values beyond the four needed here.
r_array, min_n, max_n, mean_n, *_ = calc_n_neighbours_per_radius(
    distances, save=True, base_name=figsrc_dir / f"{scan_name}"
    )
[139]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    1.67, n_points
)
../_images/benchmark_scans_34_0.png

Scan

[129]:
scan_name = "no_structure"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[62]:
for record in RECORD_MAP[scan_name]:
    clustering.summary.append(record)
[130]:
# Parameter scan: the neighbour input is prepared once per radius `r` and
# reused for all cutoffs `c` in 0..150.
# NOTE(review): `np.arange` with a float step -- the number of radii depends
# on floating-point rounding at the end point (the recorded table has
# 15100 rows = 100 radii x 151 cutoffs).
for r in tqdm(np.arange(0.01, 1.01, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        # Collect the latest summary entry (presumably the record of this
        # fit) in the summary of the data set's own clustering object
        clustering.summary.append(neighbours_clustering.summary[-1])
[131]:
clustering.summary.to_DataFrame()
[131]:
n_points radius_cutoff cnn_cutoff member_cutoff max_clusters n_clusters ratio_largest ratio_noise execution_time
0 5000 0.01 0 20 <NA> 0 0.0 1.0 0.000189
1 5000 0.01 1 20 <NA> 0 0.0 1.0 0.000126
2 5000 0.01 2 20 <NA> 0 0.0 1.0 0.000124
3 5000 0.01 3 20 <NA> 0 0.0 1.0 0.000122
4 5000 0.01 4 20 <NA> 0 0.0 1.0 0.000124
... ... ... ... ... ... ... ... ... ...
15095 5000 1.00 146 20 <NA> 1 1.0 0.0 0.038833
15096 5000 1.00 147 20 <NA> 1 1.0 0.0 0.039439
15097 5000 1.00 148 20 <NA> 1 1.0 0.0 0.038678
15098 5000 1.00 149 20 <NA> 1 1.0 0.0 0.039222
15099 5000 1.00 150 20 <NA> 1 1.0 0.0 0.039001

15100 rows × 9 columns

Plots

[62]:
scan_name = "no_structure"
clustering = CLUSTERING_MAP[scan_name]

Time

[133]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
        # "locator": mpl.ticker.LogLocator()
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 150)
[133]:
(0.0, 150.0)
../_images/benchmark_scans_43_1.png

Noise level

[135]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax, quantity="ratio_noise",
    contour_props={
        "levels": 100,
        "vmin": 0,
        "vmax": 1,
    },
    # convert=lambda x: x * 100
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_noise" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("noise ratio")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 150)
[135]:
(0.0, 150.0)
../_images/benchmark_scans_46_1.png

Largest cluster

[136]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": 100,
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_largest" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("largest ratio")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 150)
[136]:
(0.0, 150.0)
../_images/benchmark_scans_48_1.png

Number of clusters

[137]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 150)
[137]:
(0.0, 150.0)
../_images/benchmark_scans_50_1.png

Mono blob

[167]:
scan_name = "mono_blob"
[310]:
# The test data
data = helper.gen_blobs_points(
    (6000, 2),
    shuffle=False
)
data = data[:2000]
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[310]:
<matplotlib.collections.PathCollection at 0x7fbbc5567730>
../_images/benchmark_scans_53_1.png

Checks

[311]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_55_0.png

Di blob

[167]:
scan_name = "di_blob"
[312]:
# The test data
data = helper.gen_blobs_points(
    (6000, 2),
    shuffle=False
)
data = data[2000:]
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[312]:
<matplotlib.collections.PathCollection at 0x7fbc00a7e370>
../_images/benchmark_scans_58_1.png

Checks

[313]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_60_0.png

Quadro blob

[167]:
scan_name = "quad_blob"
[349]:
# The test data
data = helper.gen_blobs_points(
    (6000, 2),
    centers=np.array([[-5, -5], [-9, 8], [4, -5], [15, 12]]),
    shuffle=False
)
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[349]:
<matplotlib.collections.PathCollection at 0x7fbbc7802d00>
../_images/benchmark_scans_63_1.png

Checks

[350]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_65_0.png

Blobs

[337]:
scan_name = "blobs"
[338]:
# The test data
data = helper.gen_blobs_points((5000, 2))
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[338]:
<matplotlib.collections.PathCollection at 0x7fbbc5870970>
../_images/benchmark_scans_68_1.png

Checks

[169]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_70_0.png
[170]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}_dist")
[171]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  4.422092256111229e-05
Max:  4.097490617525474
[172]:
# The helper returns the per-highlight distributions as a fifth element;
# the starred target discards any values beyond the four needed here.
r_array, min_n, max_n, mean_n, *_ = calc_n_neighbours_per_radius(
    distances, save=True, base_name=figsrc_dir / f"{scan_name}"
    )
[173]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.18, n_points
)
../_images/benchmark_scans_74_0.png

Scan

[151]:
scan_name = "blobs"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[152]:
# Parameter scan: the neighbour input is prepared once per radius `r` and
# reused for all cutoffs `c` in 0..150.
# NOTE(review): `np.arange` with a float step -- the number of radii depends
# on floating-point rounding at the end point.
for r in tqdm(np.arange(0.01, 0.61, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        # Collect the latest summary entry (presumably the record of this
        # fit) in the summary of the data set's own clustering object
        clustering.summary.append(neighbours_clustering.summary[-1])

Plots

[166]:
scan_name = "blobs"
clustering = CLUSTERING_MAP[scan_name]

Time

[153]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[153]:
(0.0, 150.0)
../_images/benchmark_scans_81_1.png

Noise level

[154]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_noise" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("noise ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[154]:
(0.0, 150.0)
../_images/benchmark_scans_83_1.png

Largest cluster

[155]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_largest" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("largest ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[155]:
(0.0, 150.0)
../_images/benchmark_scans_85_1.png

Number of clusters

[156]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[156]:
(0.0, 150.0)
../_images/benchmark_scans_87_1.png

Moons

[174]:
scan_name = "moons"
[175]:
# The test data
data = helper.gen_moons_points(
    (5000, 2),
    noise=.05
)
distances = pairwise_distances(data)
n_points = data.shape[0]

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[175]:
<matplotlib.collections.PathCollection at 0x7fc65ed931f0>
../_images/benchmark_scans_90_1.png

Checks

[159]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_92_0.png
[160]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}_dist")
[161]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  9.927475001661245e-05
Max:  3.892931346774215
[162]:
# The helper returns the per-highlight distributions as a fifth element;
# the starred target discards any values beyond the four needed here.
r_array, min_n, max_n, mean_n, *_ = calc_n_neighbours_per_radius(
    distances, save=True, base_name=figsrc_dir / f"{scan_name}"
    )
[165]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.21, n_points
)
../_images/benchmark_scans_96_0.png

Scan

[176]:
scan_name = "moons"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[177]:
# Parameter scan: the neighbour input is prepared once per radius `r` and
# reused for all cutoffs `c` in 0..150.
# NOTE(review): `np.arange` with a float step -- the number of radii depends
# on floating-point rounding at the end point.
for r in tqdm(np.arange(0.01, 0.61, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        # Collect the latest summary entry (presumably the record of this
        # fit) in the summary of the data set's own clustering object
        clustering.summary.append(neighbours_clustering.summary[-1])

Plots

[76]:
scan_name = "moons"
clustering = CLUSTERING_MAP[scan_name]

Time

[178]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[178]:
(0.0, 150.0)
../_images/benchmark_scans_103_1.png

Noise level

[179]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_noise" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("noise ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[179]:
(0.0, 150.0)
../_images/benchmark_scans_105_1.png

Largest cluster

[180]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_largest" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("largest ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[180]:
(0.0, 150.0)
../_images/benchmark_scans_107_1.png

Number of clusters

[181]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[181]:
(0.0, 150.0)
../_images/benchmark_scans_109_1.png

Mono circle

[301]:
scan_name = "mono_circle"
[305]:
# The test data
data = helper.gen_circles_points(
    (5000, 2),
    factor=.5,
    shuffle=False,
    noise=.05
)
data = data[:2500]
n_points = data.shape[0]
distances = pairwise_distances(data)

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[305]:
<matplotlib.collections.PathCollection at 0x7fbbc63fbaf0>
../_images/benchmark_scans_112_1.png

Checks

[306]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_114_0.png

Circles

[301]:
scan_name = "circles"
[302]:
# The test data
data = helper.gen_circles_points(
    (5000, 2),
    factor=.5,
    noise=.05
)
n_points = data.shape[0]
distances = pairwise_distances(data)

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[302]:
<matplotlib.collections.PathCollection at 0x7fbbc5c45d90>
../_images/benchmark_scans_117_1.png

Checks

[190]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_119_0.png
[191]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}_dist")
[192]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.0001264168234122532
Max:  4.199475414902217
[193]:
# The helper returns the per-highlight distributions as a fifth element;
# the starred target discards any values beyond the four needed here.
r_array, min_n, max_n, mean_n, *_ = calc_n_neighbours_per_radius(
    distances, save=True, base_name=figsrc_dir / f"{scan_name}"
    )
[194]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.23, n_points
)
../_images/benchmark_scans_123_0.png

Scan

[47]:
scan_name = "circles"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[195]:
# Parameter scan: the neighbour input is prepared once per radius `r` and
# reused for all cutoffs `c` in 0..150.
# NOTE(review): `np.arange` with a float step -- the number of radii depends
# on floating-point rounding at the end point.
for r in tqdm(np.arange(0.01, 0.61, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        # Collect the latest summary entry (presumably the record of this
        # fit) in the summary of the data set's own clustering object
        clustering.summary.append(neighbours_clustering.summary[-1])

Plots

[81]:
scan_name = "circles"
clustering = CLUSTERING_MAP[scan_name]

Time

[196]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[196]:
(0.0, 150.0)
../_images/benchmark_scans_130_1.png

Noise level

[197]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_noise" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("noise ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[197]:
(0.0, 150.0)
../_images/benchmark_scans_132_1.png

Largest cluster

[198]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
# "ratio_largest" is shown unconverted on a 0-1 scale, so it is a ratio and
# not a percentage
colorbar.set_label("largest ratio")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[198]:
(0.0, 150.0)
../_images/benchmark_scans_134_1.png

Number of clusters

[85]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[85]:
(0.0, 150.0)
../_images/benchmark_scans_136_1.png

Varied

[351]:
scan_name = "varied"
[352]:
# The test data
data = helper.gen_blobs_points(
    (5000, 2),
    random_state=170,
    cluster_std=[1.0, 2.5, 0.5],
)
n_points = data.shape[0]
distances = pairwise_distances(data)

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[352]:
<matplotlib.collections.PathCollection at 0x7fbbc765fc70>
../_images/benchmark_scans_139_1.png
[353]:
np.save(figsrc_dir / f"{scan_name}/{scan_name}_data.npy", data)

Checks

[299]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_142_0.png
[285]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}/{scan_name}_dist")
[286]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.00021837824482724168
Max:  5.457257351668825
[300]:
r_array, min_n, max_n, mean_n, highlights_distributions = calc_n_neighbours_per_radius(
    distances,
    highlights=np.array([0.19, 1.56, 2.97]),
    hist_props={"bins": 20},
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}"
    )
[289]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.19, n_points
)
../_images/benchmark_scans_146_0.png

Scan

[206]:
scan_name = "varied"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[207]:
for r in tqdm(np.arange(0.01, 0.51, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        clustering.summary.append(neighbours_clustering.summary[-1])

Plots

[160]:
scan_name = "varied"
clustering = CLUSTERING_MAP[scan_name]
[161]:
clustering.summary.to_DataFrame()
[161]:
n_points radius_cutoff cnn_cutoff member_cutoff max_clusters n_clusters ratio_largest ratio_noise execution_time
0 5000 0.01 0 20 <NA> 5 0.0818 0.8856 0.001223
1 5000 0.01 1 20 <NA> 6 0.0198 0.9528 0.001058
2 5000 0.01 2 20 <NA> 2 0.0174 0.9784 0.000993
3 5000 0.01 3 20 <NA> 1 0.0108 0.9892 0.000940
4 5000 0.01 4 20 <NA> 1 0.0086 0.9914 0.000867
... ... ... ... ... ... ... ... ... ...
7545 5000 0.50 146 20 <NA> 1 0.9252 0.0748 0.555526
7546 5000 0.50 147 20 <NA> 1 0.9252 0.0748 0.568364
7547 5000 0.50 148 20 <NA> 1 0.9242 0.0758 0.595478
7548 5000 0.50 149 20 <NA> 1 0.9238 0.0762 0.584090
7549 5000 0.50 150 20 <NA> 1 0.9228 0.0772 0.578298

7550 rows × 9 columns

Time

[208]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 0.5)
ax.set_ylim(0, 150)
[208]:
(0.0, 150.0)
../_images/benchmark_scans_154_1.png

Noise level

[209]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("noise / %")
ax.set_xlim(0.01, 0.5)
ax.set_ylim(0, 150)
[209]:
(0.0, 150.0)
../_images/benchmark_scans_156_1.png

Largest cluster

[210]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("largest / %")
ax.set_xlim(0.01, 0.5)
ax.set_ylim(0, 150)
[210]:
(0.0, 150.0)
../_images/benchmark_scans_158_1.png

Number of clusters

[211]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")
ax.set_xlim(0.01, 0.5)
ax.set_ylim(0, 150)
[211]:
(0.0, 150.0)
../_images/benchmark_scans_160_1.png

Evaluation

[166]:
# Re-cluster the "varied" data at one hand-picked (r, c) pair and copy the
# resulting labels onto the points-based clustering so they can be plotted.
r = 0.2
c = 50

# Neighbourhoods are precomputed for radius r and clustered via the
# neighbourhood-based recipe, exactly as in the scan loop.
neighbours = helper.compute_neighbours(data, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

# Pass the radius the neighbourhoods were actually computed with (as the
# scan cells do) instead of a placeholder 0., so that the fit and its
# summary record refer to the real radius cutoff.
neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[168]:
fig, ax = plt.subplots()
clustering.evaluate(ax=ax)
[168]:
(<Figure size 600x370.828 with 1 Axes>,
 <AxesSubplot:xlabel='$x$', ylabel='$y$'>)
../_images/benchmark_scans_163_1.png

Aniso

[213]:
scan_name = "aniso"
[212]:
# The test data
data = helper.gen_blobs_points(
    (5000, 2),
    random_state=170,

)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
data = np.dot(data, transformation)
n_points = data.shape[0]

distances = pairwise_distances(data)

fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data.T,
    s=0.5,
)
[212]:
<matplotlib.collections.PathCollection at 0x7fc65ef777f0>
../_images/benchmark_scans_166_1.png

Checks

[214]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_168_0.png
[215]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}_dist")
[216]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.0001934117086504001
Max:  3.6222164035708793
[217]:
r_array, min_n, max_n, mean_n = calc_n_neighbours_per_radius(
    distances, save=True, base_name=figsrc_dir / f"{scan_name}"
    )
[218]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.13, n_points
)
../_images/benchmark_scans_172_0.png

Scan

[219]:
scan_name = "aniso"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[220]:
for r in tqdm(np.arange(0.01, 0.61, 0.01)):
    neighbours = helper.compute_neighbours(data, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(151):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        clustering.summary.append(neighbours_clustering.summary[-1])

Plots

[99]:
scan_name = "aniso"
clustering = CLUSTERING_MAP[scan_name]

Time

[221]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[221]:
(0.0, 150.0)
../_images/benchmark_scans_179_1.png

Noise level

[222]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("noise / %")
ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[222]:
(0.0, 150.0)
../_images/benchmark_scans_181_1.png
[223]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("noise / %")
ax.set_xlim(0.01, 0.13)
ax.set_ylim(0, 150)
[223]:
(0.0, 150.0)
../_images/benchmark_scans_182_1.png

Largest cluster

[224]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("largest / %")
ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[224]:
(0.0, 150.0)
../_images/benchmark_scans_184_1.png

Number of clusters

[225]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")
ax.set_xlim(0.01, 0.6)
ax.set_ylim(0, 150)
[225]:
(0.0, 150.0)
../_images/benchmark_scans_186_1.png

Evaluation

[106]:
# Re-cluster the "aniso" data at one hand-picked (r, c) pair and copy the
# resulting labels onto the points-based clustering so they can be plotted.
r = 0.13
c = 50

# Neighbourhoods are precomputed for radius r and clustered via the
# neighbourhood-based recipe, exactly as in the scan loop.
neighbours = helper.compute_neighbours(data, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

# Pass the radius the neighbourhoods were actually computed with (as the
# scan cells do) instead of a placeholder 0., so that the fit and its
# summary record refer to the real radius cutoff.
neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[107]:
fig, ax = plt.subplots(figsize=(2, 2/1.618))
clustering.evaluate(ax=ax)
[107]:
(<Figure size 1800x1200 with 1 Axes>, <AxesSubplot:xlabel='$x$', ylabel='$y$'>)
../_images/benchmark_scans_189_1.png

Backbone dihedrals

[378]:
scan_name = "backbone"
[379]:
# The test data
data_path = data_dir / "Alanine/projections/phipsi.npy"
data = [p[::400, :] for p in np.load(data_path)]
[380]:
data_concatened = np.array(np.concatenate(data), dtype=np.float64, order="c")
n_points = data_concatened.shape[0]
[368]:
np.save(figsrc_dir / f"{scan_name}/{scan_name}_data.npy", data_concatened)
[381]:
fig, ax = plt.subplots(figsize=(1, 1))
ax.scatter(
    *data_concatened.T,
    s=0.5,
)
ax.set_xlim((-np.pi, np.pi))
ax.set_ylim((-np.pi, np.pi))
[381]:
(-3.141592653589793, 3.141592653589793)
../_images/benchmark_scans_195_1.png
[382]:
# Build the full pairwise distance matrix of the dihedral data under a
# periodic Euclidean metric with period 2*pi in both dimensions.
data_alt = _types.InputDataExtPointsMemoryview(data_concatened)
period = np.array([2*np.pi, 2*np.pi], dtype=float)
metric = _types.MetricExtEuclideanPeriodicReduced(period)

# Fill the strict upper triangle only; the diagonal stays zero.
distances = np.zeros((n_points, n_points))
for row in range(n_points - 1):
    for col in range(row + 1, n_points):
        # presumably the "reduced" metric yields a squared distance, hence
        # the square root -- TODO confirm against the metric implementation
        reduced = metric.calc_distance(row, col, data_alt)
        distances[row, col] = np.sqrt(reduced)

# Mirror the upper triangle into the lower one to make the matrix symmetric.
tril = np.tril_indices(n_points)
distances[tril] = distances.T[tril]

Checks

[383]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_198_0.png
[272]:
h, binmids = calc_histogram(distances.flatten(), save=True, base_name=figsrc_dir / f"{scan_name}/{scan_name}_dist")
[269]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.0001605242388684838
Max:  4.439522323492073
[270]:
scan_name
[270]:
'backbone'
[281]:
r_array, min_n, max_n, mean_n, highlights_distributions = calc_n_neighbours_per_radius(
    distances,
    highlights=np.array([0.42, 3.09]),
    hist_props={"bins": 20},
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}"
    )
[ ]:

[386]:
plot_n_neighbours_per_radius(
    r_array, min_n, max_n, mean_n,
    0.42, n_points
)
../_images/benchmark_scans_204_0.png

Scan

[384]:
scan_name = "backbone"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[385]:
for record in RECORD_MAP[scan_name]:
    clustering.summary.append(record)
[215]:
# RECORD_MAP[scan_name] = clustering.summary._list
[ ]:

[ ]:
for r in tqdm(np.arange(0.01, 1.01, 0.01)):

    neighbours = [np.where(d < r)[0] for d in distances]
    for n in neighbours:
        n.sort()

    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(251, 2000):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        clustering.summary.append(neighbours_clustering.summary[-1])
[397]:
# Persist the summary records of the selected scan as JSON under
# records/<scan_name>.json.
scan_name = "backbone"

# Set to False to refuse replacing an existing record file.
overwrite = True

record_file = pathlib.Path(f"records/{scan_name}.json")
if record_file.is_file() and not overwrite:
    # Interpolate the path; the previous message printed the literal text
    # "str(record_file)" because the braces were missing.
    raise RuntimeError(f"File exists: {record_file}")

with open(record_file, "w") as fp:
    json.dump(
        {scan_name: CLUSTERING_MAP[scan_name].summary._list},
        fp,
        cls=helper.RecordEncoder,
        indent=4,
    )

Plots

[137]:
scan_name = "backbone"
clustering = CLUSTERING_MAP[scan_name]

Time

[391]:
fig, ax = plt.subplots(figsize=(2, 2/1.618))
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 100,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 2000)
[391]:
(0.0, 2000.0)
../_images/benchmark_scans_215_1.png

Noise level

[392]:
# Noise level
fig, ax = plt.subplots(figsize=(2, 2/1.618))
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("noise / %")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 2000)
[392]:
(0.0, 2000.0)
../_images/benchmark_scans_217_1.png

Largest cluster

[394]:
# Largest cluster
fig, ax = plt.subplots(figsize=(2, 2/1.618))
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("largest / %")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 2000)
[394]:
(0.0, 2000.0)
../_images/benchmark_scans_219_1.png

Number of clusters

[395]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots(figsize=(2, 2/1.618))
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")

ax.set_xlim(0.01, 1)
ax.set_ylim(0, 2000)
[395]:
(0.0, 2000.0)
../_images/benchmark_scans_221_1.png

Evaluation

[69]:
scan_name = "backbone"
clustering = cluster.prepare_clustering(data)
[236]:
# Grid of cluster results for the backbone dihedrals: one row per similarity
# cutoff c, one column per radius cutoff r.
plt.close("all")
fig, Ax = plt.subplots(4, 5, figsize=(7.2, 6.0))

c_list = [1, 10, 50, 100]
r_list = [0.5, 0.2, 0.13, 0.09, 0.068]


def get_member_cutoff(r, c):
    """Return the member cutoff for a panel: 20 for r < 0.2, otherwise 10.

    The similarity cutoff ``c`` is accepted but not used.
    """
    if r < 0.2:
        return 20
    return 10


# The neighbourhoods depend on r only, so they are built once per column
# (radius) and reused for every c, as in the scan loops, instead of being
# recomputed for each of the 20 panels.
for j, r in enumerate(r_list):

    # np.where returns indices in ascending order, so the neighbour lists
    # are already sorted and need no extra sort.
    neighbours = [np.where(d < r)[0] for d in distances]

    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for i, c in enumerate(c_list):
        neighbours_clustering.fit(r, c, member_cutoff=get_member_cutoff(r, c), v=False)
        clustering._labels = neighbours_clustering._labels

        # np.save(figsrc_dir / scan_name / f"{scan_name}_labels_20_30.npy", clustering.labels.labels)
        clustering.evaluate(
            ax=Ax[i, j],
            ax_props={
                "title": f"$r={r}$ $c={c}$",
                "xlim": (-np.pi, np.pi),
                "ylim": (-np.pi, np.pi),
                "xticks": (),
                "yticks": (),
                "xlabel": None,
                "ylabel": None,
                "aspect": "equal"
                },
            annotate_pos="random"
            )

# for j, r in enumerate(r_list):
#     Ax[0, j].annotate(
#         xy=(0.5, 1),
#         text=f"$r={r}$",
#         xycoords='axes fraction',
#     )

# Only the bottom-left panel carries ticks and axis labels.  Raw strings keep
# the LaTeX backslashes from being read as (invalid) escape sequences.
Ax[-1, 0].set(**{
    "xlim": (-np.pi, np.pi),
    "ylim": (-np.pi, np.pi),
    "xticks": (-np.pi, 0, np.pi),
    "yticks": (-np.pi, 0, np.pi),
    "xticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "yticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "xlabel": r"$\phi$",
    "ylabel": r"$\psi$",
    "aspect": "equal"
})
fig.tight_layout(pad=0.1, w_pad=0.01, h_pad=0.01)
../_images/benchmark_scans_224_0.png
[246]:
# 3x5 grid of cluster results for the backbone dihedrals at a fixed radius
# cutoff and increasing similarity cutoff c.
plt.close("all")
fig, Ax = plt.subplots(3, 5, figsize=(7.2, 3.33))
Ax = Ax.flatten()

c_list = [1, 10, 20, 30, 50, 100, 150, 250, 500, 750, 1000, 1500, 2000, 2500, 3000]
r_list = [0.42] * 15

# The neighbourhoods depend on r only; rebuild them only when r changes
# (here: once) instead of once per panel.
prepared_r = None
for i, (r, c) in enumerate(zip(r_list, c_list)):

    if r != prepared_r:
        # np.where returns indices in ascending order, so the neighbour
        # lists are already sorted and need no extra sort.
        neighbours = [np.where(d < r)[0] for d in distances]

        neighbours_clustering = cluster.prepare_clustering(
            neighbours,
            preparation_hook=cluster.prepare_neighbourhoods,
            **neighbours_sorted_recipe
        )
        prepared_r = r

    neighbours_clustering.fit(r, c, member_cutoff=10, v=False)
    clustering._labels = neighbours_clustering._labels

    # np.save(figsrc_dir / scan_name / f"{scan_name}_labels_20_30.npy", clustering.labels.labels)
    clustering.evaluate(
        ax=Ax[i],
        ax_props={
            "title": f"$r={r}$ $c={c}$",
            "xlim": (-np.pi, np.pi),
            "ylim": (-np.pi, np.pi),
            "xticks": (),
            "yticks": (),
            "xlabel": None,
            "ylabel": None,
            "aspect": "equal"
            },
        annotate_pos="random"
        )

# for j, r in enumerate(r_list):
#     Ax[0, j].annotate(
#         xy=(0.5, 1),
#         text=f"$r={r}$",
#         xycoords='axes fraction',
#     )

# Only the first panel of the second row carries ticks and axis labels.
# Raw strings keep the LaTeX backslashes from being read as (invalid)
# escape sequences.
Ax[5].set(**{
    "xlim": (-np.pi, np.pi),
    "ylim": (-np.pi, np.pi),
    "xticks": (-np.pi, 0, np.pi),
    "yticks": (-np.pi, 0, np.pi),
    "xticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "yticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "xlabel": r"$\phi$",
    "ylabel": r"$\psi$",
    "aspect": "equal"
})
fig.tight_layout(pad=0.1, w_pad=0.01, h_pad=0.01)
../_images/benchmark_scans_225_0.png
[249]:
# 3x5 grid of cluster results for the backbone dihedrals at a fixed radius
# cutoff and increasing similarity cutoff c.
plt.close("all")
fig, Ax = plt.subplots(3, 5, figsize=(7.2, 3.33))
Ax = Ax.flatten()

c_list = [1, 5, 10, 20, 30, 50, 75, 100, 150, 200, 300, 400, 500, 600, 750]
r_list = [0.2] * 15

# The neighbourhoods depend on r only; rebuild them only when r changes
# (here: once) instead of once per panel.
prepared_r = None
for i, (r, c) in enumerate(zip(r_list, c_list)):

    if r != prepared_r:
        # np.where returns indices in ascending order, so the neighbour
        # lists are already sorted and need no extra sort.
        neighbours = [np.where(d < r)[0] for d in distances]

        neighbours_clustering = cluster.prepare_clustering(
            neighbours,
            preparation_hook=cluster.prepare_neighbourhoods,
            **neighbours_sorted_recipe
        )
        prepared_r = r

    neighbours_clustering.fit(r, c, member_cutoff=10, v=False)
    clustering._labels = neighbours_clustering._labels

    # np.save(figsrc_dir / scan_name / f"{scan_name}_labels_20_30.npy", clustering.labels.labels)
    clustering.evaluate(
        ax=Ax[i],
        ax_props={
            "title": f"$r={r}$ $c={c}$",
            "xlim": (-np.pi, np.pi),
            "ylim": (-np.pi, np.pi),
            "xticks": (),
            "yticks": (),
            "xlabel": None,
            "ylabel": None,
            "aspect": "equal"
            },
        annotate_pos="random"
        )

# for j, r in enumerate(r_list):
#     Ax[0, j].annotate(
#         xy=(0.5, 1),
#         text=f"$r={r}$",
#         xycoords='axes fraction',
#     )

# Only the first panel of the second row carries ticks and axis labels.
# Raw strings keep the LaTeX backslashes from being read as (invalid)
# escape sequences.
Ax[5].set(**{
    "xlim": (-np.pi, np.pi),
    "ylim": (-np.pi, np.pi),
    "xticks": (-np.pi, 0, np.pi),
    "yticks": (-np.pi, 0, np.pi),
    "xticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "yticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "xlabel": r"$\phi$",
    "ylabel": r"$\psi$",
    "aspect": "equal"
})
fig.tight_layout(pad=0.1, w_pad=0.01, h_pad=0.01)
../_images/benchmark_scans_226_0.png
[250]:
# 3x5 grid of cluster results for the backbone dihedrals at a fixed radius
# cutoff and increasing similarity cutoff c.
plt.close("all")
fig, Ax = plt.subplots(3, 5, figsize=(7.2, 3.33))
Ax = Ax.flatten()

c_list = [1, 10, 20, 30, 50, 100, 150, 250, 500, 750, 1000, 1500, 2000, 2500, 3000]
r_list = [0.75] * 15

# The neighbourhoods depend on r only; rebuild them only when r changes
# (here: once) instead of once per panel.
prepared_r = None
for i, (r, c) in enumerate(zip(r_list, c_list)):

    if r != prepared_r:
        # np.where returns indices in ascending order, so the neighbour
        # lists are already sorted and need no extra sort.
        neighbours = [np.where(d < r)[0] for d in distances]

        neighbours_clustering = cluster.prepare_clustering(
            neighbours,
            preparation_hook=cluster.prepare_neighbourhoods,
            **neighbours_sorted_recipe
        )
        prepared_r = r

    neighbours_clustering.fit(r, c, member_cutoff=10, v=False)
    clustering._labels = neighbours_clustering._labels

    # np.save(figsrc_dir / scan_name / f"{scan_name}_labels_20_30.npy", clustering.labels.labels)
    clustering.evaluate(
        ax=Ax[i],
        ax_props={
            "title": f"$r={r}$ $c={c}$",
            "xlim": (-np.pi, np.pi),
            "ylim": (-np.pi, np.pi),
            "xticks": (),
            "yticks": (),
            "xlabel": None,
            "ylabel": None,
            "aspect": "equal"
            },
        annotate_pos="random"
        )

# for j, r in enumerate(r_list):
#     Ax[0, j].annotate(
#         xy=(0.5, 1),
#         text=f"$r={r}$",
#         xycoords='axes fraction',
#     )

# Only the first panel of the second row carries ticks and axis labels.
# Raw strings keep the LaTeX backslashes from being read as (invalid)
# escape sequences.
Ax[5].set(**{
    "xlim": (-np.pi, np.pi),
    "ylim": (-np.pi, np.pi),
    "xticks": (-np.pi, 0, np.pi),
    "yticks": (-np.pi, 0, np.pi),
    "xticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "yticklabels": (r"$-\pi$", 0, r"$\pi$"),
    "xlabel": r"$\phi$",
    "ylabel": r"$\psi$",
    "aspect": "equal"
})
fig.tight_layout(pad=0.1, w_pad=0.01, h_pad=0.01)
../_images/benchmark_scans_227_0.png
Hierarchical manual
[207]:
r = 0.5
c = 10

h_clustering = cluster.prepare_clustering(data)
h_clustering._metric = _types.MetricExtEuclideanPeriodicReduced(
    np.array([2*np.pi, 2*np.pi], dtype=float)
)

h_clustering.fit(r, c, member_cutoff=10, v=False)
h_clustering.isolate()
[208]:
for sub_clustering, (r, c) in [(h_clustering.children[1], (0.16, 10))]:
    sub_clustering.fit(r, c, member_cutoff=20, v=False)
    sub_clustering.isolate()
[209]:
# Side-by-side view of the manual hierarchical clustering: pie chart of the
# cluster hierarchy on the left, tree graph on the right.
fig, (pie_ax, tree_ax) = plt.subplots(1, 2, figsize=(3.33, 3.33 / 1.618 / 2))
h_clustering.pie(
    ax=pie_ax,
    pie_props={
        "radius": 0.6,
        "wedgeprops": dict(width=0.6, edgecolor="k")
    }
)

graph = h_clustering.to_nx_DiGraph(ignore={0})

# Node keys are dot-separated strings; only the last component is shown as
# the node label.
shortened_labels = {key: key.rsplit(".", 1)[-1] for key in graph.nodes}

cycler = mpl.rcParams["axes.prop_cycle"]
colors = cycler.by_key()["color"]
depth_counter = defaultdict(int)

# Colour the nodes of each hierarchy level in cycle order.
node_colors = []
for node in graph.nodes:
    # Derive the level from the number of separators rather than from the
    # string length, which misgroups nodes once a label has several digits.
    depth = node.count(".")
    # Wrap around so levels with more nodes than colours do not fail.
    node_colors.append(colors[depth_counter[depth] % len(colors)])
    depth_counter[depth] += 1

plot.plot_graph_sugiyama_straight(
    graph,
    ax=tree_ax,
    pos_props={
        "source": "1",
        },
    draw_props={
        "labels": shortened_labels,
        "with_labels": True,
        "node_shape": "s",
        "edgecolors": "k",
        "node_size": 100,
        "node_color": node_colors,  # [cycler.by_key()["color"][int(shortened_labels[node]) - 1] for node in graph.nodes ],
        "font_size": 8,
    }
)
fig.tight_layout()
# fig.savefig(figsrc_dir / scan_name / f"{scan_name}_pie_tree.png")
../_images/benchmark_scans_231_0.png
[210]:
h_clustering.reel()
h_clustering.labels.sort_by_size()
[212]:
# Plot the reeled hierarchical cluster labels on the full dihedral range.
fig, ax = plt.subplots()
h_clustering.evaluate(
        ax=ax,
        ax_props={
            "xlim": (-np.pi, np.pi),
            "ylim": (-np.pi, np.pi),
            "xticks": (-np.pi, 0, np.pi),
            "yticks": (-np.pi, 0, np.pi),
            # Raw strings keep the LaTeX backslashes from being read as
            # (invalid) escape sequences.
            "xticklabels": (r"$-\pi$", 0, r"$\pi$"),
            "yticklabels": (r"$-\pi$", 0, r"$\pi$"),
            "aspect": "equal"
            },
        annotate_pos="random"
        )
[212]:
(<Figure size 600x370.828 with 1 Axes>,
 <AxesSubplot:xlabel='$x$', ylabel='$y$'>)
../_images/benchmark_scans_233_1.png
[494]:
np.save(figsrc_dir / scan_name / f"{scan_name}_labels_reeled.npy", h_clustering.labels.labels)

Peptide (TICA)

[369]:
scan_name = "tica"
[370]:
# The test data
data_path = data_dir / "6a5j/projections/tica_500_4.npy"
data = [p[::10, :] for p in np.load(data_path)]
[371]:
data_concatened = np.concatenate(data)
n_points = data_concatened.shape[0]

distances = pairwise_distances(data_concatened[::2])
[364]:
np.save(figsrc_dir / f"{scan_name}/{scan_name}_data.npy", data_concatened)
[293]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(2, 1))
ax1.scatter(
    *data_concatened[:, :2].T,
    s=0.5,
)
ax2.scatter(
    *data_concatened[:, 2:].T,
    s=0.5,
)
plt.tight_layout(w_pad=1)
../_images/benchmark_scans_240_0.png

Checks

[294]:
fig, ax = plt.subplots()
_ = plot.plot_histogram(ax, distances.flatten(), maxima=True, maxima_props={"order": 5})
../_images/benchmark_scans_242_0.png
[295]:
h, binmids = calc_histogram(
    distances.flatten(),
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}_dist"
)
[267]:
print("Min: ", distances[distances > 0].min())
print("Max: ", distances.max())
Min:  0.005043478
Max:  6.638115
[375]:
r_array, min_n, max_n, mean_n, highlights_distributions = calc_n_neighbours_per_radius(
    distances,
    points_factor=2,
    highlights=np.array([0.33, 1.83, 2.62, 3.95]),
    hist_props={"bins": 20},
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}"
    )
[271]:
# Plot min/max/mean neighbour counts per radius for the TICA data.
# NOTE(review): the counts are multiplied by 2 here, presumably to compensate
# for `distances` being computed on every second point only
# (data_concatened[::2]). The calc_n_neighbours_per_radius call in this
# section already passes points_factor=2 -- confirm the counts are not
# scaled twice.
plot_n_neighbours_per_radius(
    r_array, min_n * 2, max_n * 2, mean_n * 2,
    0.3, n_points
)
../_images/benchmark_scans_246_0.png

Scan

[272]:
scan_name = "tica"
clustering = CLUSTERING_MAP[scan_name] = cluster.prepare_clustering(data)
[ ]:
for r in tqdm(np.arange(0.01, 1.01, 0.01)):
    neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
    neighbours_clustering = cluster.prepare_clustering(
        neighbours,
        preparation_hook=cluster.prepare_neighbourhoods,
        **neighbours_sorted_recipe
    )

    for c in range(251):
        neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
        clustering.summary.append(neighbours_clustering.summary[-1])
[401]:
for record in clustering.summary:
    record.radius_cutoff = round(record.radius_cutoff, 5)

Plots

[99]:
scan_name = "tica"
clustering = CLUSTERING_MAP[scan_name]

Time

[404]:
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="execution_time",
    contour_props={
        "levels": 50,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax)
colorbar.set_label("time / s")

ax.set_xlim(0.01, 1.0)
ax.set_ylim(0, 250)
[404]:
(0.0, 250.0)
../_images/benchmark_scans_254_1.png

Noise level

[405]:
# Noise level
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_noise",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("noise / %")
ax.set_xlim(0.01, 1.0)
ax.set_ylim(0, 250)
[405]:
(0.0, 250.0)
../_images/benchmark_scans_256_1.png

Largest cluster

[406]:
# Largest cluster
fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="ratio_largest",
    contour_props={
        "levels": np.arange(0, 1.01, 0.01),
        "vmin": 0,
        "vmax": 1,
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=(0, 0.5, 1))
colorbar.set_label("largest / %")
ax.set_xlim(0.01, 1.0)
ax.set_ylim(0, 250)
[406]:
(0.0, 250.0)
../_images/benchmark_scans_258_1.png

Number of clusters

[407]:
# Number of clusters
show_n = 10

fig, ax = plt.subplots()
contour = clustering.summarize(
    ax=ax,
    quantity="n_clusters",
    contour_props={
        "levels": np.arange(-0.5, show_n + 1.5, 1),
        "vmin": 0,
        "vmax": show_n,
        "extend": "max"
    }
)[2][0]
colorbar = fig.colorbar(mappable=contour, ax=ax, ticks=range(0, show_n + 1, 2))
colorbar.set_label("# clusters")
ax.set_xlim(0.01, 1.0)
ax.set_ylim(0, 250)
[407]:
(0.0, 250.0)
../_images/benchmark_scans_260_1.png

Evaluation

[13]:
scan_name = "tica"
clustering = cluster.prepare_clustering(data)
[410]:
r = 1
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[413]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_264_0.png
[414]:
r = 0.75
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[415]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_266_0.png
[416]:
r = 0.5
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[417]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_268_0.png
[418]:
r = 0.25
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[419]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_270_0.png
[420]:
r = 0.15
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[421]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_272_0.png
[422]:
r = 0.1
c = 40

neighbours = helper.compute_neighbours(data_concatened, r, sort=True)
neighbours_clustering = cluster.prepare_clustering(
    neighbours,
    preparation_hook=cluster.prepare_neighbourhoods,
    **neighbours_sorted_recipe
)

neighbours_clustering.fit(r, c, member_cutoff=20, v=False)
clustering._labels = neighbours_clustering._labels
[423]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_274_0.png

Hierarchical semi-automatic

[12]:
h_clustering = cluster.prepare_clustering(
    data,
    **neighbours_sorted_alternative_recipe
)
[13]:
r = [0.5, 0.25]
c = 40

h_clustering.fit_hierarchical(r, c, member_cutoff=20)
[14]:
fig, Ax = plt.subplots(
    1, 2,
    figsize=(
        mpl.rcParams["figure.figsize"][0] * 1.5,
        mpl.rcParams["figure.figsize"][1],
    )
)
for axi, dim in enumerate(range(0, 4, 2)):
    h_clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()}
    )
../_images/benchmark_scans_278_0.png
[15]:
fig, ax = plt.subplots(figsize=(3.33, 3.33))
h_clustering.pie(ax=ax)
[15]:
(<Figure size 999x999 with 1 Axes>, <AxesSubplot:>)
../_images/benchmark_scans_279_1.png
[16]:
fig, ax = plt.subplots(figsize=(3.33, 1.5))
h_clustering.tree(
    ax=ax,
    pos_props={"x_spacing": 10, "y_spacing": 0.01},
    draw_props={"node_size": 100, "font_size": 6},

)
[16]:
(<Figure size 999x450 with 1 Axes>, <AxesSubplot:>)
../_images/benchmark_scans_280_1.png
[17]:
def trim_trivial(clustering):
    """Discard label assignments that contain a single label only.

    Walks the hierarchy depth-first, starting at `clustering`. A node
    without labels is left alone. A node whose label mapping has exactly
    one entry gets its labels reset to ``None`` and its children are not
    visited. Any other node keeps its labels and the walk continues with
    each of its children.

    Args:
        clustering: Object exposing `_labels` (``None`` or an object with
            a sized `mapping` attribute) and `_children` (a mapping whose
            values are objects of the same kind, or ``None``).

    Returns:
        None. The hierarchy is modified in place.
    """
    labels = clustering._labels
    if labels is None:
        return

    if len(labels.mapping) == 1:
        clustering._labels = None
        return

    # A labelled node is not guaranteed to have been split into children;
    # treat a missing child mapping like an empty one instead of failing
    # on `None.values()`.
    children = clustering._children or {}
    for child in children.values():
        trim_trivial(child)
[18]:
# Reset single-label assignments throughout the hierarchy (in place).
trim_trivial(h_clustering)
[19]:
# NOTE(review): presumably folds the child label assignments back into the
# root labels so they can be evaluated together — confirm against the
# clustering API.
h_clustering.reel()
[20]:
# Side-by-side panels on a canvas 1.5 times the default width.
base_width, base_height = mpl.rcParams["figure.figsize"]
fig, Ax = plt.subplots(1, 2, figsize=(base_width * 1.5, base_height))

# One panel per pair of consecutive dimensions: (0, 1) and (2, 3).
for axi, dim in enumerate((0, 2)):
    h_clustering.evaluate(
        ax=Ax[axi],
        dim=(dim, dim + 1),
        # Bare panels: no axis labels and no ticks.
        ax_props={"xlabel": None, "ylabel": None, "xticks": (), "yticks": ()},
    )
../_images/benchmark_scans_284_0.png

Langerin (PCA)

[290]:
# Identifier of this scan; used below to build output paths under figsrc_dir.
scan_name = "pca"
[291]:
# The test data: for each entry along the first axis of the loaded array,
# keep every 10th row.
# NOTE(review): this section is titled PCA and scan_name is "pca", but the
# file loaded is named tica_500_4.npy — confirm this is the intended
# projection.
data_path = data_dir / "6a5j/projections/tica_500_4.npy"
data = [p[::10, :] for p in np.load(data_path)]
[292]:
# Join all entries of `data` into a single array; n_points is its length
# along the first axis.
data_concatened = np.concatenate(data)
n_points = data_concatened.shape[0]

# Pairwise distances are computed on every second point only.
# NOTE(review): presumably to keep the distance matrix small — confirm.
distances = pairwise_distances(data_concatened[::2])
[293]:
# Overview of the data: the first two columns on the left panel, the
# remaining columns on the right panel.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(2, 1))
for panel, columns in ((ax1, slice(None, 2)), (ax2, slice(2, None))):
    panel.scatter(*data_concatened[:, columns].T, s=0.5)
plt.tight_layout(w_pad=1)
../_images/benchmark_scans_289_0.png

Checks

[294]:
# Histogram of all pairwise distances with maxima detection switched on.
fig, ax = plt.subplots()
_ = plot.plot_histogram(
    ax,
    distances.flatten(),
    maxima=True,
    maxima_props={"order": 5},
)
../_images/benchmark_scans_291_0.png
[295]:
# Histogram of all pairwise distances, returned as counts and bin midpoints.
# NOTE(review): save=True together with base_name presumably writes the
# result under figsrc_dir/<scan_name>/ — confirm in calc_histogram.
h, binmids = calc_histogram(
    distances.flatten(),
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}_dist"
)
[267]:
# Report the smallest non-zero and the largest pairwise distance; zeros are
# excluded from the minimum.
nonzero_distances = distances[distances > 0]
print("Min: ", nonzero_distances.min())
print("Max: ", distances.max())
Min:  0.005043478
Max:  6.638115
[296]:
# Neighbour counts as a function of the radius, with distributions collected
# at the highlighted radii.
(
    r_array,
    min_n,
    max_n,
    mean_n,
    highlights_distributions,
) = calc_n_neighbours_per_radius(
    distances,
    highlights=np.array([0.33, 1.83, 2.62, 3.95]),
    hist_props={"bins": 20},
    save=True,
    base_name=figsrc_dir / f"{scan_name}/{scan_name}",
)
[271]:
# NOTE(review): the counts are doubled, presumably to compensate for
# `distances` being built from every second point only
# (data_concatened[::2]) while n_points refers to the full data — confirm.
plot_n_neighbours_per_radius(
    r_array, min_n * 2, max_n * 2, mean_n * 2,
    0.3, n_points
)
../_images/benchmark_scans_295_0.png