""" Complexity Index
Complexity index (Janson2004_, Rapp2007_) computes the `l`-subword complexity (`l`-subword spectrum) of a one-dimensional,
symbolic (integer) time series, by finding the number of distinct subwords
of length `l`. The total complexity is given by the sum of all subwords of different lengths.
The unnormalized measure can be used to compare sequences of equal lengths.
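
For example, for the symbolic sequence ``[0, 1, 1, 0]`` the distinct subwords
of length 1 are {0, 1} (2 of them), of length 2 are {01, 11, 10} (3), and of
length 3 are {011, 110} (2); the complexity index is therefore 2 + 3 + 2 = 7,
with subword spectrum [2, 3, 2].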

Notes
-----
* This is a direct translation from the Complexity toolbox available at
  http://users.auth.gr/~stdimitr/files/software/complexitiy.rar
* Original author is Stavros Dimitriadis <stidimitriadis@gmail.com>

.. [Janson2004] Janson, S., Lonardi, S., & Szpankowski, W. (2004). On average
   sequence complexity. Theoretical Computer Science, 326(1-3), 213-227.
.. [Rapp2007] Rapp, P. E. (2007). Quantitative characterization of animal
   behavior following blast exposure. Cognitive Neurodynamics, 1(4), 287-293.
"""
# Author: Avraam Marimpis <avraam.marimpis@gmail.com>
import numpy as np


def complexity_index(x, sub_len=-1, normalize=False, iterations=100):
""" Complexity Index

    Parameters
    ----------
    x : array-like, shape(n_samples)
        Input symbolic time series.
    sub_len : int
        Maximum subword length; any value less than `2` (the default `-1`)
        falls back to `len(x) - 1`.
    normalize : bool
        Normalize the result against a randomization procedure. Default is
        `False`.
    iterations : int
        Number of surrogate randomizations averaged over during the
        normalization. Default is `100`.

    Returns
    -------
    normal_ci : float
        The computed complexity index after normalization against the
        randomization procedure. Only returned when `normalize` is `True`.
    ci : float
        The computed complexity index.
    spectrum : array-like
        The number of distinct subwords of each length, from `1` up to the
        maximum subword length.
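
    Examples
    --------
    A minimal usage sketch (the sequence is made up for illustration; the
    returned values depend on the input):

    >>> import numpy as np
    >>> x = np.array([0, 1, 1, 0, 1, 0, 0, 1])
    >>> ci, spectrum = complexity_index(x)
    >>> normal_ci, ci, spectrum = complexity_index(x, normalize=True)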
"""
x = np.int32(x)
x = x.flatten()
len_x = len(x)
ci, spectrum = __compute_complexity_index(x, sub_len)
if normalize:
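        # Normalize by the mean CI of random surrogates drawn uniformly from
        # an alphabet of the same size (fixed seed, for reproducibility).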
rng = np.random.RandomState(0)
mean_ci = 0.0
num_letters = spectrum[0]
for _ in range(iterations):
new_x = np.int32(np.floor(rng.rand(len_x) * num_letters))
new_ci, _ = __compute_complexity_index(new_x, sub_len)
mean_ci += new_ci / iterations
normal_ci = ci / mean_ci
return normal_ci, ci, spectrum
else:
return ci, spectrum


def __compute_complexity_index(x, sub_len=-1):
""" Complexity Index
Parameters
----------
x :
Input symbolic time series.
sub_len : int
Maximum subword length. Default is `len(x) - 1`.
Returns
-------
ci : float
The computed complexity index.
spectrum : array-like
A list of the number of distinct subwords of length 1, up to the size
of the input symbolic time series.
"""
x = np.int32(x)
x = x.flatten()
len_x = len(x)
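    # Any `sub_len` below 2 falls back to the longest proper subword length.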
max_subword_len = len_x - 1
if sub_len >= 2:
max_subword_len = sub_len
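    # Shift the symbols so that the smallest one is 0.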
min_x = np.min(x)
x = x - min_x
letters = np.unique(x)
max_len_word = np.min([max_subword_len, len_x - 1])
    spectrum = np.ones(max_len_word)
spectrum[0] = len(letters)
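
    # For each subword length, collect all subwords of that length by tiling
    # the sequence at every possible phase (shift), then count distinct ones.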
for word_len in range(1, max_len_word):
real_word_len = word_len + 1
cumulative_words = None
for shift in range(real_word_len):
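            # Non-overlapping words of length `real_word_len` starting at
            # `shift`; iterating over all shifts covers every start position.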
num_words = np.int32(np.floor((len_x - shift) / real_word_len))
if num_words > 0:
idx1 = shift
idx2 = real_word_len * num_words + shift
sliced = x[idx1:idx2]
words = np.reshape(sliced, (num_words, real_word_len)).T
if cumulative_words is None:
cumulative_words = words
else:
cumulative_words = np.hstack([cumulative_words, words])
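        # Encode each word as a single integer via base conversion so that
        # distinct words can be counted with np.unique.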
conv_cumulative_words = __rowsBaseConv(cumulative_words.T)
u_cumulative_words = np.unique(conv_cumulative_words)
spectrum[word_len] = len(u_cumulative_words)
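
    # The complexity index is the total number of distinct subwords across
    # all lengths, i.e. the sum of the spectrum.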
ci = np.sum(spectrum)
return ci, spectrum


def __rowsBaseConv(x, base=None):
"""
Parameters
----------
x :
base : integer
Returns
-------
"""
if base is None:
base = np.max(x) + 1
_, p = np.shape(x)
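    # Build place values base**(p-1), ..., base**1, base**0, one per digit
    # position. Note: the float dot product can lose precision for very long
    # words, in which case distinct words may collide.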
    bases = np.ones(p) * base
    exponents = list(range(p - 1, -1, -1))
    place_values = np.power(bases, exponents)
    result = x.dot(place_values)
return result
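

if __name__ == "__main__":
    # Minimal usage sketch; the input here is made up for illustration (any
    # one-dimensional integer array works). `sub_len` caps the subword length,
    # which also keeps the base conversion in __rowsBaseConv exact.
    rng = np.random.RandomState(42)
    x = rng.randint(0, 4, size=128)
    normal_ci, ci, spectrum = complexity_index(x, sub_len=10, normalize=True)
    print("ci:", ci)
    print("normalized ci:", normal_ci)
    print("spectrum:", spectrum)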