From c74c3e120c8da442c89347bb5474fa4e2a0af2b9 Mon Sep 17 00:00:00 2001 From: David Rotermund <54365609+davrot@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:37:55 +0100 Subject: [PATCH] Update README.md Signed-off-by: David Rotermund <54365609+davrot@users.noreply.github.com> --- scikit-learn/kmeans/README.md | 109 +++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/scikit-learn/kmeans/README.md b/scikit-learn/kmeans/README.md index 59c661d..ccb6f01 100644 --- a/scikit-learn/kmeans/README.md +++ b/scikit-learn/kmeans/README.md @@ -72,7 +72,7 @@ data_b = np.concatenate((b_x, b_y), axis=1) data = np.concatenate((data_a, data_b), axis=0) -kmeans = KMeans(n_clusters=2) +kmeans = KMeans(n_clusters=2, n_init = 10) kmeans.fit(data) @@ -90,4 +90,111 @@ plt.show() ![image1](image1.png) +> **labels_** : ndarray of shape (n_samples,) +> Labels of each point +## Where does the algorithm "think" the data points belong? + +```python +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans + +rng = np.random.default_rng(1) + +a_x = rng.normal(1.5, 1.0, size=(1000))[:, np.newaxis] +a_y = rng.normal(3.0, 1.0, size=(1000))[:, np.newaxis] +data_a = np.concatenate((a_x, a_y), axis=1) + +b_x = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis] +b_y = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis] +data_b = np.concatenate((b_x, b_y), axis=1) + +data = np.concatenate((data_a, data_b), axis=0) + +kmeans = KMeans(n_clusters=2, n_init = 10) +kmeans.fit(data) + +labels = kmeans.labels_ +idx_0 = np.where(labels == 0)[0] +idx_1 = np.where(labels == 1)[0] + +plt.plot(data[idx_0, 0], data[idx_0, 1], "r.") +plt.plot(data[idx_1, 0], data[idx_1, 1], "b.") +plt.plot( + kmeans.cluster_centers_[0, 0], kmeans.cluster_centers_[0, 1], "k*", markersize=12 +) +plt.plot( + kmeans.cluster_centers_[1, 0], kmeans.cluster_centers_[1, 1], "k*", markersize=12 +) + +plt.show() +``` + +![image2](image2.png) + + +## 
[predict](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict) + +```python +predict(X, sample_weight='deprecated') +``` + +> Predict the closest cluster each sample in X belongs to. +> +> In the vector quantization literature, cluster\_centers\_ is called the code book and each value returned by predict is the index of the closest code in the code book. + +```python +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans + +rng = np.random.default_rng(1) + +a_x = rng.normal(1.5, 1.0, size=(1000))[:, np.newaxis] +a_y = rng.normal(3.0, 1.0, size=(1000))[:, np.newaxis] +data_a = np.concatenate((a_x, a_y), axis=1) + +b_x = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis] +b_y = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis] +data_b = np.concatenate((b_x, b_y), axis=1) + +data = np.concatenate((data_a, data_b), axis=0) + +kmeans = KMeans(n_clusters=2, n_init=10) +kmeans.fit(data) + + +x = np.linspace(data[:, 0].min(), data[:, 0].max(), 100) +y = np.linspace(data[:, 1].min(), data[:, 1].max(), 100) +xx, yy = np.meshgrid(x, y) + +xx_r = xx.ravel()[:, np.newaxis] +yy_r = yy.ravel()[:, np.newaxis] + +print(xx.shape) # -> (100, 100) +print(xx_r.shape) # -> (10000, 1) +print(yy.shape) # -> (100, 100) +print(yy_r.shape) # -> (10000, 1) + +coordinates = np.concatenate((xx_r, yy_r), axis=1) +print(coordinates.shape) # -> (10000, 2) + +labels = kmeans.predict(coordinates) +idx_0 = np.where(labels == 0)[0] +idx_1 = np.where(labels == 1)[0] + + +plt.plot(coordinates[idx_0, 0], coordinates[idx_0, 1], "r.") +plt.plot(coordinates[idx_1, 0], coordinates[idx_1, 1], "b.") +plt.plot( + kmeans.cluster_centers_[0, 0], kmeans.cluster_centers_[0, 1], "k*", markersize=12 +) +plt.plot( + kmeans.cluster_centers_[1, 0], kmeans.cluster_centers_[1, 1], "k*", markersize=12 +) + +plt.show() +``` + +![image3](image3.png)