Clustering of scikit-learn toy data sets

Notebook configuration

[1]:
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler

import cnnclustering
from cnnclustering import cluster, hooks
from cnnclustering import _types, _fit
[2]:
# Optional helper functionality from the benchmark module
#     https://github.com/janjoswig/CommonNNClustering-Benchmark
sys.path.insert(0, "../benchmark")
import helper_base

Print Python and package version information:

[3]:
# Version information
print("Python: ", *sys.version.split("\n"))

print("Packages:")
for package in [mpl, np, sklearn, cnnclustering]:
    print(f"    {package.__name__}: {package.__version__}")
Python:  3.8.8 (default, Mar 11 2021, 08:58:19)  [GCC 8.3.0]
Packages:
    matplotlib: 3.3.4
    numpy: 1.20.1
    sklearn: 0.24.1
    cnnclustering: 0.4.3

We use Matplotlib to create plots. The matplotlibrc file in the root directory of the CommonNNClustering repository is used to customize the appearance of the plots.

[4]:
# Matplotlib configuration
mpl.rc_file(
    "../../matplotlibrc",
    use_default_template=False
)
[5]:
# Axis property defaults for the plots
ax_props = {
    "xlabel": None,
    "ylabel": None,
    "xlim": (-2.5, 2.5),
    "ylim": (-2.5, 2.5),
    "xticks": (),
    "yticks": (),
    "aspect": "equal"
}

# Line plot property defaults
line_props = {
    "linewidth": 0,
    "marker": '.',
}

Data set generation

To see common-nearest-neighbours (CommonNN) clustering in action, let’s have a look at a handful of basic 2D data sets from scikit-learn (as shown in the scikit-learn documentation). We will cluster the data sets starting from different input data formats, using the provided defaults. For more details, see also the Advanced usage tutorial.

[6]:
# Data set generation parameters
np.random.seed(0)
n_samples = 2000
[7]:
# Data set generation
# Standardise all data sets to zero mean and unit variance
#    using `data = StandardScaler().fit_transform(data)`

# circles
noisy_circles, _ = datasets.make_circles(
    n_samples=n_samples,
    factor=.5,
    noise=.05
    )
noisy_circles = StandardScaler().fit_transform(noisy_circles)

# moons
noisy_moons, _ = datasets.make_moons(
    n_samples=n_samples,
    noise=.05
    )
noisy_moons = StandardScaler().fit_transform(noisy_moons)

# blobs
blobs, _ = datasets.make_blobs(
    n_samples=n_samples,
    random_state=8
    )
blobs = StandardScaler().fit_transform(blobs)

# None
no_structure = np.random.rand(
    n_samples, 2
    )
no_structure = StandardScaler().fit_transform(no_structure)

# aniso
random_state = 170
X, y = datasets.make_blobs(
    n_samples=n_samples,
    random_state=random_state
    )

transformation = [[0.6, -0.6], [-0.4, 0.8]]
aniso = np.dot(X, transformation)
aniso = StandardScaler().fit_transform(aniso)

# varied
varied, _ = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state
    )
varied = StandardScaler().fit_transform(varied)
[8]:
# Define cluster parameters
dsets = [ # "name", set, **parameters
    ('circles', noisy_circles, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 100,
        'max_clusters': None
        }),
    ('moons', noisy_moons, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 2,
        'max_clusters': None
        }),
    ('varied', varied, {
        'radius_cutoff': 0.28,
        'cnn_cutoff': 20,
        'member_cutoff': 20,
        'max_clusters': None
        }),
    ('aniso', aniso, {
        'radius_cutoff': 0.29,
        'cnn_cutoff': 30,
        'member_cutoff': 5,
        'max_clusters': None
         }),
    ('blobs', blobs, {
        'radius_cutoff': 0.4,
        'cnn_cutoff': 20,
        'member_cutoff': 2,
        'max_clusters': None
        }),
    ('None', no_structure, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 1,
        'max_clusters': None
        }),
    ]
[9]:
# Plot the original data sets
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, *_) in enumerate(dsets):
    Ax[count].plot(data[:, 0], data[:, 1], **line_props)
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
../_images/tutorial_scikit_learn_datasets_15_0.png

CommonNN clustering using data point coordinates as input

[10]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Default clustering initialisation
    clustering = cluster.Clustering(data)

    # Calculate neighbours brute force
    clustering.fit(**params)
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.162
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.155
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.205
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.159
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.269
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.135
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_17_1.png

When raw input data is presented to a Clustering on creation without anything else specified, the obtained Clustering object aggregates the needed clustering components under the assumption that it received point coordinates as a sequence of sequences. The neighbourhood of a specific point is collected brute force, by computing the (Euclidean) distances to all other points and comparing them to the radius cutoff. This is the slowest possible approach, but its memory usage is fairly conservative.
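
For illustration, here is a rough NumPy sketch of what such a brute-force neighbours getter does conceptually. This is not the library implementation (the actual component is a compiled extension type that, judging by the name MetricExtEuclideanReduced, presumably compares squared distances to avoid the square root):

# Conceptual sketch only, not the actual component:
# collect all points within `radius_cutoff` of point `index`
def brute_force_neighbourhood(points, index, radius_cutoff):
    distances = np.linalg.norm(points - points[index], axis=1)
    return np.nonzero(distances <= radius_cutoff)[0]

neighbourhood = brute_force_neighbourhood(noisy_circles, 0, 0.5)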

[11]:
# Clustering components used by default ("coordinates" recipe)
print(helper_base.indent_at_parens(str(clustering)))
Clustering(
    input_data=InputDataExtComponentsMemoryview,
    fitter=FitterExtBFS(
        ngetter=NeighboursGetterExtBruteForce(
            dgetter=DistanceGetterExtMetric(
                metric=MetricExtEuclideanReduced
                ),
            sorted=False,
            selfcounting=True
            ),
        na=NeighboursExtVectorCPPUnorderedSet,
        nb=NeighboursExtVectorCPPUnorderedSet,
        checker=SimilarityCheckerExtSwitchContains,
        queue=QueueExtFIFOQueue
        ),
    predictor=None
    )

CommonNN clustering with pre-computed distances

[12]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Default clustering initialisation
    clustering = cluster.Clustering(data)

    # Pre-compute distances and choose the corresponding recipe
    distances = pairwise_distances(data)
    clustering_dist = cluster.Clustering(
        distances,
        registered_recipe_key="distances"
        )

    # Use pre-computed distances
    clustering_dist.fit(**params)
    clustering._labels = clustering_dist._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.077
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.093
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.118
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.087
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.201
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.074
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_21_1.png

When raw input data is presented to a Clustering on creation in the form of a dense distance matrix, the “distances” recipe can be chosen to make the obtained Clustering object aggregate the needed clustering components. The neighbourhood of a specific point is collected brute force by looking the distances up in the input data. This is somewhat faster, since no distances need to be calculated during the clustering, but it comes with a high memory demand. It also makes it possible to leverage smart external methods for the distance calculation.
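
Since the distance matrix is computed outside of the clustering, any method that yields dense pairwise distances can be plugged in here. As a purely hypothetical variation (not shown in the timings above), one could pre-compute Manhattan instead of Euclidean distances with scikit-learn and cluster based on those:

# Hypothetical variation: cluster based on pre-computed
# Manhattan distances instead of Euclidean ones
distances = pairwise_distances(data, metric="manhattan")
clustering_manhattan = cluster.Clustering(
    distances,
    registered_recipe_key="distances"
    )
# followed by `clustering_manhattan.fit(**params)` as before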

[13]:
# Clustering components used by the "distance" recipe
print(helper_base.indent_at_parens(str(clustering_dist)))
Clustering(
    input_data=InputDataExtComponentsMemoryview,
    fitter=FitterExtBFS(
        ngetter=NeighboursGetterExtBruteForce(
            dgetter=DistanceGetterExtMetric(
                metric=MetricExtPrecomputed
                ),
            sorted=False,
            selfcounting=True
            ),
        na=NeighboursExtVectorCPPUnorderedSet,
        nb=NeighboursExtVectorCPPUnorderedSet,
        checker=SimilarityCheckerExtSwitchContains,
        queue=QueueExtFIFOQueue
        ),
    predictor=None
    )

CommonNN clustering with pre-computed neighbourhoods

[14]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    clustering = cluster.Clustering(data)

    # Pre-compute neighbourhoods and choose the corresponding recipe
    tree = KDTree(data)
    neighbourhoods = tree.query_radius(
        data, r=params["radius_cutoff"], return_distance=False
        )

    clustering_neighbourhoods = cluster.Clustering(
        neighbourhoods,
        preparation_hook=hooks.prepare_neighbourhoods,
        registered_recipe_key="neighbourhoods"
        )

    # Use pre-computed neighbourhoods
    clustering_neighbourhoods.fit(**params)
    clustering._labels = clustering_neighbourhoods._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.036
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.052
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.064
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.039
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.149
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.032
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_25_1.png

When raw input data is presented to a Clustering on creation in the form of pre-computed neighbourhoods, the “neighbourhoods” recipe can be chosen to make the obtained Clustering object aggregate the needed clustering components. The neighbourhood of a specific point is then looked up directly in the input data. This is considerably faster and has a lower memory demand than the “distances” case. It also makes it possible to leverage smart external methods for the neighbourhood calculation.

Note that the “neighbourhoods” recipe requires the raw neighbourhoods in the concrete format of a matrix, in which the neighbourhoods of individual points are padded to the length of the largest neighbourhood. If the neighbourhoods are presented as a sequence of sequences of varying length, as one would get from a tree query with sklearn.neighbors.KDTree, they can be transformed into the suitable format using the preparation hook hooks.prepare_neighbourhoods.
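
To illustrate what this padding amounts to, here is a rough sketch of the transformation (the actual hooks.prepare_neighbourhoods may differ in details, e.g. in how it passes the true neighbour counts along with the padded matrix):

# Sketch: pad a ragged sequence of neighbourhoods to a matrix
ragged = [np.array([1, 2]), np.array([0]), np.array([0, 1, 3])]
n_neighbours = np.array([len(n) for n in ragged])  # true lengths

padded = np.zeros((len(ragged), n_neighbours.max()), dtype=np.intp)
for i, n in enumerate(ragged):
    padded[i, :len(n)] = n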

[15]:
# Clustering components used by the "neighbourhoods" recipe
print(helper_base.indent_at_parens(str(clustering_neighbourhoods)))
Clustering(
    input_data=InputDataExtNeighbourhoodsMemoryview,
    fitter=FitterExtBFS(
        ngetter=NeighboursGetterExtLookup(
            sorted=False,
            selfcounting=True
            ),
        na=NeighboursExtVectorCPPUnorderedSet,
        nb=NeighboursExtVectorCPPUnorderedSet,
        checker=SimilarityCheckerExtSwitchContains,
        queue=QueueExtFIFOQueue
        ),
    predictor=None
    )

CommonNN clustering with pre-computed sorted neighbourhoods

[16]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    clustering = cluster.Clustering(data)

    # Pre-compute and sort neighbourhoods and choose the corresponding recipe
    tree = KDTree(data)
    neighbourhoods = tree.query_radius(
        data, r=params["radius_cutoff"], return_distance=False
        )

    for n in neighbourhoods:
        n.sort()

    # Initialisation choosing the sorted-neighbourhoods recipe
    clustering_neighbourhoods_sorted = cluster.Clustering(
        neighbourhoods,
        preparation_hook=hooks.prepare_neighbourhoods,
        registered_recipe_key="sorted_neighbourhoods"
        )

    # Use pre-computed neighbourhoods
    clustering_neighbourhoods_sorted.fit(**params)
    clustering._labels = clustering_neighbourhoods_sorted._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.003
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.004
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.006
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.004
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.011
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.004
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_29_1.png

Pre-computed neighbourhood information can additionally be sorted. While this has the same memory demand as using unsorted neighbourhoods, it offers the fastest clustering.
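
The speed-up comes from the similarity check: with sorted neighbour lists, counting the common neighbours of two points reduces to a single linear merge of two sorted sequences instead of repeated membership tests. Here is a minimal sketch of the underlying idea (the library's actual checker is a compiled component):

# Sketch: check whether two sorted neighbour lists share at least
# `cnn_cutoff` members, advancing two pointers in lockstep
def passes_cnn_check(a, b, cnn_cutoff):
    common = i = j = 0
    while (i < len(a)) and (j < len(b)):
        if a[i] == b[j]:
            common += 1
            if common >= cnn_cutoff:
                return True
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return common >= cnn_cutoff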

[17]:
# Clustering components used by the "sorted_neighbourhoods" recipe
print(helper_base.indent_at_parens(str(clustering_neighbourhoods_sorted)))
Clustering(
    input_data=InputDataExtNeighbourhoodsMemoryview,
    fitter=FitterExtBFS(
        ngetter=NeighboursGetterExtLookup(
            sorted=True,
            selfcounting=True
            ),
        na=NeighboursExtVector,
        nb=NeighboursExtVector,
        checker=SimilarityCheckerExtScreensorted,
        queue=QueueExtFIFOQueue
        ),
    predictor=None
    )