"""K-means clustering demo on a two-dimensional point set.

Running this file as a script loads ``xclara.csv`` (generating a synthetic
three-cluster data set first if the file does not exist), picks random
starting centroids, runs k-means, and shows a scatter plot before and after
clustering.
"""
from copy import deepcopy

import numpy as np
import pandas as pd

DATA_PATH = 'xclara.csv'
N_CLUSTERS = 3
# The original loop ran while its 1-based counter stayed below 20.
MAX_ITERATIONS = 19
CLUSTER_COLORS = ['r', 'g', 'b', 'y', 'c', 'm']


def create_sample_data(path=DATA_PATH):
    """Write a synthetic data set of three Gaussian blobs to *path*.

    The global NumPy random state is seeded with 42 so the generated file is
    reproducible. The CSV has two columns, ``V1`` and ``V2``, and 300 rows.

    Args:
        path: Destination CSV file. Defaults to ``xclara.csv``.
    """
    np.random.seed(42)
    cluster1 = np.random.normal(loc=[0, 0], scale=1, size=(100, 2))
    cluster2 = np.random.normal(loc=[10, 5], scale=1.5, size=(100, 2))
    cluster3 = np.random.normal(loc=[5, 10], scale=1.2, size=(100, 2))
    data = np.vstack([cluster1, cluster2, cluster3])
    df = pd.DataFrame(data, columns=['V1', 'V2'])
    df.to_csv(path, index=False)


def load_data(path=DATA_PATH):
    """Read the data set at *path*, creating the sample file if it is missing.

    Only a missing file triggers regeneration; any other read error (for
    example a malformed CSV) propagates to the caller instead of being
    silently replaced with sample data.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        create_sample_data(path)
        return pd.read_csv(path)


def dist(a, b, ax=1):
    """Return the Euclidean norm of ``a - b`` along axis *ax*.

    With ``ax=None`` the norm is taken over the whole difference array, which
    yields a single scalar (used here to measure total centroid movement).
    """
    return np.linalg.norm(a - b, axis=ax)


def init_centroids(points, k):
    """Draw *k* random centroids inside the per-feature range of *points*.

    Each feature is sampled separately (all values for the first feature,
    then all values for the second, ...) from a uniform distribution between
    that feature's minimum and maximum.

    Returns:
        Array of shape ``(k, n_features)``.
    """
    lower = points.min(axis=0)
    upper = points.max(axis=0)
    columns = [np.random.uniform(lo, hi, size=k) for lo, hi in zip(lower, upper)]
    return np.column_stack(columns)


def assign_clusters(points, centroids):
    """Return, for every point, the index of its nearest centroid.

    Distances from all points to all centroids are computed in one
    broadcasted operation. Ties go to the lowest centroid index.
    """
    distances = dist(points[:, np.newaxis, :], centroids[np.newaxis, :, :], ax=2)
    return np.argmin(distances, axis=1)


def update_centroids(points, labels, centroids):
    """Return new centroids as the mean of the points assigned to each one.

    A centroid with no assigned points is re-seeded uniformly at random
    inside the per-feature range of *points*, matching how the initial
    centroids are drawn. The *centroids* argument is not modified.
    """
    updated = deepcopy(np.asarray(centroids, dtype=float))
    lower = points.min(axis=0)
    upper = points.max(axis=0)
    for idx in range(len(updated)):
        members = points[labels == idx]
        if len(members) > 0:
            updated[idx] = members.mean(axis=0)
        else:
            # Empty cluster: pick a fresh random position for this centroid.
            updated[idx] = np.random.uniform(lower, upper)
    return updated


def k_means(points, initial_centroids, max_iterations=MAX_ITERATIONS):
    """Run k-means until the centroids stop moving or the iteration cap is hit.

    Progress (iteration number and total centroid movement) is printed after
    every iteration.

    Args:
        points: Array-like of shape ``(n_points, n_features)``.
        initial_centroids: Array-like of shape ``(k, n_features)``.
        max_iterations: Upper bound on the number of iterations.

    Returns:
        A ``(centroids, labels)`` tuple: the final centroid positions and an
        integer array giving the cluster index of each point.

    Raises:
        ValueError: If *points* is not a non-empty two-dimensional array.
    """
    points = np.asarray(points, dtype=float)
    if points.ndim != 2 or len(points) == 0:
        raise ValueError('points must be a non-empty 2-D array')

    centroids = np.array(initial_centroids, dtype=float)
    labels = np.zeros(len(points), dtype=int)

    for iteration in range(1, max_iterations + 1):
        # 1. Assign every point to its nearest centroid.
        labels = assign_clusters(points, centroids)
        # 2. Move each centroid to the mean of its cluster.
        updated = update_centroids(points, labels, centroids)
        print(f'Iteration {iteration}')
        # 3. Measure how far the centroids moved in total.
        movement = dist(updated, centroids, None)
        print(f'Centroid movement distance: {movement:.4f}')
        centroids = updated
        if movement == 0:
            break

    return centroids, labels


def plot_initial(plt, points, centroids):
    """Show the raw points (black) together with the starting centroids (red stars)."""
    plt.scatter(points[:, 0], points[:, 1], c='black', s=7)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='red')
    plt.title("Initial Data Points and Centroids")
    plt.show()


def plot_clusters(plt, points, labels, centroids):
    """Show the points coloured by cluster with the final centroids as black stars."""
    fig, ax = plt.subplots()
    for idx in range(len(centroids)):
        members = points[labels == idx]
        # Cycle through the palette so more clusters than colours still plot.
        colour = CLUSTER_COLORS[idx % len(CLUSTER_COLORS)]
        ax.scatter(members[:, 0], members[:, 1], s=7, c=colour)
    ax.scatter(centroids[:, 0], centroids[:, 1], marker="*", s=200, c='black')
    plt.title("Final Clustering Result")
    plt.show()


def main():
    """Load the data, run k-means with ``N_CLUSTERS`` clusters, and plot the result."""
    # Imported here so the clustering helpers above stay importable in
    # environments without a plotting stack.
    from matplotlib import pyplot as plt

    plt.rcParams['figure.figsize'] = (16, 9)
    plt.style.use('ggplot')

    data = load_data()
    points = data[['V1', 'V2']].to_numpy(dtype=float)

    centroids = init_centroids(points, N_CLUSTERS)
    plot_initial(plt, points, centroids)

    centroids, labels = k_means(points, centroids)
    plot_clusters(plt, points, labels, centroids)


if __name__ == '__main__':
    main()