Module dreamtim.statistical_tools

This module contains various self-implemented statistical tools. It isn't well documented, because I am lazy ;)

Expand source code
"""This module contains various self-implemented statistical tools. It isn't well documented, because I am lazy ;)"""

from typing import Iterable, Tuple, Sequence, Dict, List

from collections import defaultdict
from random import sample
from functools import partial
from math import fsum, sqrt
from pprint import pprint

Point = Tuple[int, ...]
Centroid = Point


def transpose(data: Iterable[Iterable]):
    """Turn the rows of a 2-D collection into columns and vice versa.

    Returns a list of tuples. As with ``zip``, the result is truncated to
    the length of the shortest row.
    """
    columns = zip(*data)
    return [*columns]


def mean(data: Iterable[float]) -> float:
    """Return the arithmetic mean of *data*, summed with ``math.fsum``.

    The input is materialised first, so one-shot iterators are accepted.
    An empty input raises ``ZeroDivisionError``.
    """
    values = tuple(data)
    total = fsum(values)
    return total / len(values)


def dist(p: Point, q: Point, fsum=fsum, sqrt=sqrt, zip=zip) -> float:
    """Return the Euclidean distance between two points of any dimension.

    Coordinates are paired with ``zip``, so extra coordinates of the longer
    point are ignored. The ``fsum``/``sqrt``/``zip`` parameters default to
    the standard implementations.
    """
    squared_diffs = [(a - b) ** 2 for a, b in zip(p, q)]
    total = fsum(squared_diffs)
    return sqrt(total)


def assign_data(centroids: Sequence[Centroid], data: Iterable[Point]) -> Dict:
    """Bucket every data point under the centroid it lies closest to.

    Returns a plain dict mapping each centroid that attracted at least one
    point to the list of its points, in the order the points were seen.
    Centroids are used as dict keys, so they must be hashable. When several
    centroids are equally close, the one listed first wins.
    """
    groups = {}
    for pt in data:
        nearest = min(centroids, key=lambda c: dist(pt, c))
        groups.setdefault(nearest, []).append(pt)
    return groups


def compute_centroids(groups: Iterable[Sequence[Point]]) -> List[Centroid]:
    """Return one centroid per group: the coordinate-wise mean of its points."""
    result = []
    for members in groups:
        # Transposing turns a list of points into one sequence per coordinate.
        per_axis = transpose(members)
        result.append(tuple(mean(axis) for axis in per_axis))
    return result


def k_means(
        data: Iterable[Point],
        k: int = 2, iterations: int = 50) -> List[Centroid]:
    """Cluster *data* with k-means and return the resulting centroids.

    Starts from ``k`` points drawn at random from the data
    (``random.sample``), then alternates between assigning every point to
    its closest centroid and moving each centroid to the mean of its group.

    Args:
        data: The points to cluster; materialised into a list once.
        k: Number of initial centroids to draw. ``random.sample`` raises
            ``ValueError`` when it exceeds the number of points.
        iterations: Upper bound on the number of assign/update rounds.

    Returns:
        The centroids after the last round. Only centroids that received
        at least one point are recomputed, so fewer than ``k`` centroids
        may be returned.
    """
    data = list(data)
    centroids = sample(data, k)
    for _ in range(iterations):
        labeled = assign_data(centroids, data)
        updated = compute_centroids(labeled.values())
        # A round that leaves the centroids unchanged is a fixed point:
        # every later round would reproduce it, so stop early.
        converged = updated == centroids
        centroids = updated
        if converged:
            break
    return centroids


# Demo entry point. The guard must compare against "__main__" (the name
# Python gives a module run as a script); "main" never matches.
if __name__ == "__main__":
    # Two visibly separated groups of 3-D points.
    points = [
        (1, 2, 3),
        (1, 2, 4),
        (2, 4, 3),
        (10, 12, 13),
        (11, 12, 13),
        (11, 13, 12),
    ]

    # Cluster the points, then show which points belong to each centroid.
    centroids = k_means(points, k=2)
    d = assign_data(centroids, points)
    pprint(d)

Functions

def assign_data(centroids: Sequence[Tuple[int, ...]], data: Iterable[Tuple[int, ...]]) ‑> Dict

Group the data points to the closest centroid

Expand source code
def assign_data(centroids: Sequence[Centroid], data: Iterable[Point]) -> Dict:
    """Bucket every data point under the centroid it lies closest to.

    Returns a plain dict mapping each centroid that attracted at least one
    point to the list of its points, in the order the points were seen.
    Centroids are used as dict keys, so they must be hashable. When several
    centroids are equally close, the one listed first wins.
    """
    groups = {}
    for pt in data:
        nearest = min(centroids, key=lambda c: dist(pt, c))
        groups.setdefault(nearest, []).append(pt)
    return groups
def compute_centroids(groups: Iterable[Sequence[Tuple[int, ...]]]) ‑> List[Tuple[int, ...]]

Compute the centroid for each group

Expand source code
def compute_centroids(groups: Iterable[Sequence[Point]]) -> List[Centroid]:
    """Return one centroid per group: the coordinate-wise mean of its points."""
    result = []
    for members in groups:
        # Transposing turns a list of points into one sequence per coordinate.
        per_axis = transpose(members)
        result.append(tuple(mean(axis) for axis in per_axis))
    return result
def dist(p: Tuple[int, ...], q: Tuple[int, ...], fsum=<built-in function fsum>, sqrt=<built-in function sqrt>, zip=builtins.zip) ‑> float

Euclidean distance for multidimensional data

Expand source code
def dist(p: Point, q: Point, fsum=fsum, sqrt=sqrt, zip=zip) -> float:
    """Return the Euclidean distance between two points of any dimension.

    Coordinates are paired with ``zip``, so extra coordinates of the longer
    point are ignored. The ``fsum``/``sqrt``/``zip`` parameters default to
    the standard implementations.
    """
    squared_diffs = [(a - b) ** 2 for a, b in zip(p, q)]
    total = fsum(squared_diffs)
    return sqrt(total)
def k_means(data: Iterable[Tuple[int, ...]], k: int = 2, iterations: int = 50) ‑> List[Tuple[int, ...]]
Expand source code
def k_means(
        data: Iterable[Point],
        k: int = 2, iterations: int = 50) -> List[Centroid]:
    """Cluster *data* with k-means and return the resulting centroids.

    Starts from ``k`` points drawn at random from the data
    (``random.sample``), then alternates between assigning every point to
    its closest centroid and moving each centroid to the mean of its group.

    Args:
        data: The points to cluster; materialised into a list once.
        k: Number of initial centroids to draw. ``random.sample`` raises
            ``ValueError`` when it exceeds the number of points.
        iterations: Upper bound on the number of assign/update rounds.

    Returns:
        The centroids after the last round. Only centroids that received
        at least one point are recomputed, so fewer than ``k`` centroids
        may be returned.
    """
    data = list(data)
    centroids = sample(data, k)
    for _ in range(iterations):
        labeled = assign_data(centroids, data)
        updated = compute_centroids(labeled.values())
        # A round that leaves the centroids unchanged is a fixed point:
        # every later round would reproduce it, so stop early.
        converged = updated == centroids
        centroids = updated
        if converged:
            break
    return centroids
def mean(data: Iterable[float]) ‑> float

Accurate arithmetic mean

Expand source code
def mean(data: Iterable[float]) -> float:
    """Return the arithmetic mean of *data*, summed with ``math.fsum``.

    The input is materialised first, so one-shot iterators are accepted.
    An empty input raises ``ZeroDivisionError``.
    """
    values = tuple(data)
    total = fsum(values)
    return total / len(values)
def transpose(data: Iterable[Iterable])

Swap the rows and columns in a 2-D array of data

Expand source code
def transpose(data: Iterable[Iterable]):
    """Turn the rows of a 2-D collection into columns and vice versa.

    Returns a list of tuples. As with ``zip``, the result is truncated to
    the length of the shortest row.
    """
    columns = zip(*data)
    return [*columns]