Update README.md

Signed-off-by: David Rotermund <54365609+davrot@users.noreply.github.com>
2025-07-08 08:00:02 +02:00 · 2023-12-19 14:37:55 +01:00 · 2023-12-19 14:37:55 +01:00 · c74c3e120c
commit c74c3e120c
parent f99563f1c8
1 changed file with 108 additions and 1 deletion
--- a/scikit-learn/kmeans/README.md
+++ b/scikit-learn/kmeans/README.md
@@ -72,7 +72,7 @@ data_b = np.concatenate((b_x, b_y), axis=1)

 data = np.concatenate((data_a, data_b), axis=0)

-kmeans = KMeans(n_clusters=2)
+kmeans = KMeans(n_clusters=2, n_init = 10)
 kmeans.fit(data)


@@ -90,4 +90,111 @@ plt.show()

 ![image1](image1.png)

+> **labels_** : ndarray of shape (n_samples,)
+>     Labels of each point

+## Where does the algorithm "think" the data points belong?
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(1)
+
+a_x = rng.normal(1.5, 1.0, size=(1000))[:, np.newaxis]
+a_y = rng.normal(3.0, 1.0, size=(1000))[:, np.newaxis]
+data_a = np.concatenate((a_x, a_y), axis=1)
+
+b_x = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis]
+b_y = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis]
+data_b = np.concatenate((b_x, b_y), axis=1)
+
+data = np.concatenate((data_a, data_b), axis=0)
+
+kmeans = KMeans(n_clusters=2, n_init = 10)
+kmeans.fit(data)
+
+labels = kmeans.labels_
+idx_0 = np.where(labels == 0)[0]
+idx_1 = np.where(labels == 1)[0]
+
+plt.plot(data[idx_0, 0], data[idx_0, 1], "r.")
+plt.plot(data[idx_1, 0], data[idx_1, 1], "b.")
+plt.plot(
+    kmeans.cluster_centers_[0, 0], kmeans.cluster_centers_[0, 1], "k*", markersize=12
+)
+plt.plot(
+    kmeans.cluster_centers_[1, 0], kmeans.cluster_centers_[1, 1], "k*", markersize=12
+)
+
+plt.show()
+```
+
+![image2](image2.png)
+
+
+## [predict](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict)
+
+```python
+predict(X, sample_weight='deprecated')
+```
+
+> Predict the closest cluster each sample in X belongs to.
+> 
+> In the vector quantization literature, cluster\_centers\_ is called the code book and each value returned by predict is the index of the closest code in the code book.
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(1)
+
+a_x = rng.normal(1.5, 1.0, size=(1000))[:, np.newaxis]
+a_y = rng.normal(3.0, 1.0, size=(1000))[:, np.newaxis]
+data_a = np.concatenate((a_x, a_y), axis=1)
+
+b_x = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis]
+b_y = rng.normal(0.0, 1.0, size=(1000))[:, np.newaxis]
+data_b = np.concatenate((b_x, b_y), axis=1)
+
+data = np.concatenate((data_a, data_b), axis=0)
+
+kmeans = KMeans(n_clusters=2, n_init=10)
+kmeans.fit(data)
+
+
+x = np.linspace(data[:, 0].min(), data[:, 0].max(), 100)
+y = np.linspace(data[:, 1].min(), data[:, 1].max(), 100)
+xx, yy = np.meshgrid(x, y)
+
+xx_r = xx.ravel()[:, np.newaxis]
+yy_r = yy.ravel()[:, np.newaxis]
+
+print(xx.shape)  # -> (100, 100)
+print(xx_r.shape)  # -> (10000, 1)
+print(yy.shape)  # -> (100, 100)
+print(yy_r.shape)  # -> (10000, 1)
+
+coordinates = np.concatenate((xx_r, yy_r), axis=1)
+print(coordinates.shape)  # -> (10000, 2)
+
+labels = kmeans.predict(coordinates)
+idx_0 = np.where(labels == 0)[0]
+idx_1 = np.where(labels == 1)[0]
+
+
+plt.plot(coordinates[idx_0, 0], coordinates[idx_0, 1], "r.")
+plt.plot(coordinates[idx_1, 0], coordinates[idx_1, 1], "b.")
+plt.plot(
+    kmeans.cluster_centers_[0, 0], kmeans.cluster_centers_[0, 1], "k*", markersize=12
+)
+plt.plot(
+    kmeans.cluster_centers_[1, 0], kmeans.cluster_centers_[1, 1], "k*", markersize=12
+)
+
+plt.show()
+```
+
+![image3](image3.png)