Source code for qlearnkit.algorithms.qkmeans.centroid_initialization
from math import floor
import numpy as np
def random(X: np.ndarray,
           n_clusters: int,
           random_state: int = 42) -> np.ndarray:
    """
    Create random cluster centroids.

    Centroids are rows of ``X`` drawn uniformly at random, with
    replacement, so the same row may be returned more than once.

    Args:
        X:
            The dataset to be used for centroid initialization.
        n_clusters:
            The desired number of clusters for which centroids are required.
        random_state:
            Determines random number generation for centroid initialization.

    Returns:
        Collection of k centroids as a numpy ndarray.

    Raises:
        ValueError: If ``X`` contains no samples.
    """
    # Seeds NumPy's global generator, as callers of this module expect.
    np.random.seed(random_state)
    X = np.asarray(X)
    m = np.shape(X)[0]
    if m == 0:
        raise ValueError("X must contain at least one sample")
    # The upper bound of randint is exclusive: use m (not m - 1) so the last
    # row is eligible and a single-row dataset does not raise.
    rows = np.random.randint(0, m, size=n_clusters)
    return X[rows]
def kmeans_plus_plus(X: np.ndarray,
                     k: int,
                     random_state: int = 42) -> np.ndarray:
    """
    Create cluster centroids using the k-means++ algorithm.

    The first centroid is always ``X[0]``. Every further centroid is drawn
    from ``X`` with probability proportional to the squared distance between
    a point and its nearest already-chosen centroid.

    Args:
        X:
            The dataset to be used for centroid initialization.
        k:
            The desired number of clusters for which centroids are required.
        random_state:
            Determines random number generation for centroid initialization.

    Returns:
        Collection of k centroids as a numpy ndarray.
    """
    # Seeds NumPy's global generator, as callers of this module expect.
    np.random.seed(random_state)
    X = np.asarray(X)
    centroids = [X[0]]
    m = X.shape[0]
    # One flat feature vector per sample, so 1-D and 2-D inputs both work.
    points = X.reshape(m, -1)

    newest = 0          # index in X of the most recently chosen centroid
    min_dist_sq = None  # squared distance of each point to its nearest centroid
    for _ in range(1, k):
        # Only the newest centroid can lower a point's nearest distance, so
        # the running minimum is updated instead of rescanning all centroids.
        delta = points - points[newest]
        dist_to_newest = (delta * delta).sum(axis=1)
        if min_dist_sq is None:
            min_dist_sq = dist_to_newest
        else:
            min_dist_sq = np.minimum(min_dist_sq, dist_to_newest)

        # Exactly one random draw per centroid, whichever branch is taken.
        r = np.random.rand()
        total = min_dist_sq.sum()
        if total > 0:
            cumulative_probs = np.cumsum(min_dist_sq / total)
            # First index j with r < cumulative_probs[j].
            newest = int(np.searchsorted(cumulative_probs, r, side="right"))
            if newest >= m:
                # Rounding left the last cumulative value just below r: fall
                # back to the last point that has a non-zero probability.
                newest = int(np.flatnonzero(min_dist_sq)[-1])
        else:
            # Every point coincides with a chosen centroid, so the weights
            # are undefined (0 / 0); pick uniformly instead.
            newest = min(int(r * m), m - 1)
        centroids.append(X[newest])
    return np.array(centroids)
def naive_sharding(X: np.ndarray,
                   k: int) -> np.ndarray:
    """
    Create cluster centroids using deterministic naive sharding algorithm.

    Samples are ordered by the sum of their features and cut into ``k``
    contiguous shards of ``floor(m / k)`` rows each, where ``m`` is the
    number of samples; the last shard also takes any leftover rows. Each
    centroid is the feature-wise mean of one shard.

    Args:
        X:
            The dataset to be used for centroid initialization.
        k:
            The desired number of clusters for which centroids are required.

    Returns:
        Collection of k centroids as a numpy ndarray.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples in ``X``.
    """
    X = np.asarray(X)
    m = np.shape(X)[0]
    n = np.shape(X)[1]
    if not 1 <= k <= m:
        raise ValueError(
            f"k must be between 1 and the number of samples ({m}), got {k}"
        )

    # Reorder whole rows by their composite (row-sum) value. Sorting each
    # column on its own would break samples apart; a stable sort keeps the
    # result deterministic when sums tie.
    order = np.argsort(X.sum(axis=1), kind="stable")
    ordered = X[order]

    step = m // k
    centroids = np.zeros((k, n))
    for j in range(k):
        start = j * step
        # The last shard absorbs the remainder when m is not divisible by k.
        stop = m if j == k - 1 else start + step
        # mean() divides by the shard's real size, not by the nominal step.
        centroids[j] = ordered[start:stop].mean(axis=0)
    return centroids