diff --git a/k-means2 b/k-means2 index bbdd8ee..cac9d7a 100644 --- a/k-means2 +++ b/k-means2 @@ -1,13 +1,34 @@ -from copy import deepcopy import numpy as np import pandas as pd from matplotlib import pyplot as plt -plt.rcParams['figure.figsize'] = (16,9) +from copy import deepcopy + +# 设置图形样式 +plt.rcParams['figure.figsize'] = (16, 9) plt.style.use('ggplot') + +# 创建示例数据并保存为CSV文件 +def create_sample_data(): + # 生成三个簇的示例数据 + np.random.seed(42) + cluster1 = np.random.normal(loc=[0, 0], scale=1, size=(100, 2)) + cluster2 = np.random.normal(loc=[10, 5], scale=1.5, size=(100, 2)) + cluster3 = np.random.normal(loc=[5, 10], scale=1.2, size=(100, 2)) + data = np.vstack([cluster1, cluster2, cluster3]) + + # 创建DataFrame并保存为CSV + df = pd.DataFrame(data, columns=['V1', 'V2']) + df.to_csv('xclara.csv', index=False) + +# 创建示例CSV文件 +create_sample_data() + +# 从CSV读取数据 data = pd.read_csv('xclara.csv') f1 = data['V1'].values f2 = data['V2'].values X = np.array(list(zip(f1, f2))) + # 距离计算函数 def dist(a, b, ax=1): return np.linalg.norm(a - b, axis=ax) @@ -16,33 +37,49 @@ def dist(a, b, ax=1): k = 3 # 随机初始化质心(修正:使用数据范围) -C_x = np.random.randint(0,np.max(X)-20, size=k) -C_y = np.random.randint(0,np.max(X)-20, size=k) +C_x = np.random.uniform(np.min(f1), np.max(f1), size=k) +C_y = np.random.uniform(np.min(f2), np.max(f2), size=k) C = np.array(list(zip(C_x, C_y)), dtype=np.float32) -C_old = np.zeros(C.shape) -print(C) -clusters = np.zeros(len(X)) -iteration_flag = dist(C,C_old,1) -tmp = 1 -while iteration_flag.any() != 0 and tmp<20: - for i in range(len(X)): - distances = dist(X[i],C,1) - clusters[i] = clusters - C_old = deepcopy(C) - for i in range(C): - points = [X[j] for j in range(len(X)) if clusters[j] == i] - C[i] = np.mean(points,axis=0) +# 绘制初始数据点和质心(修正颜色拼写错误) +plt.scatter(f1, f2, c='black', s=7) # 修正:'balck' -> 'black' +plt.scatter(C_x, C_y, marker='*', s=200, c='red') +plt.title("Initial Data Points and Centroids") +plt.show() - - print('%d'%tmp) - tmp = tmp + 1 - iteraction_flag = dist(C,C_old,1) - print('distance:',iteraction_flag) -colors = ['r','g','b','y','c','m'] -fig,ax = plt.subplots() +# ---- 可选:添加完整的K-Means算法实现 ---- +# 复制原始质心用于后续更新 +C_old = np.zeros(C.shape) +clusters = np.zeros(len(X)) +error = dist(C, C_old, None) + +# K-Means迭代 +while error != 0: + # 分配点到最近质心 + for i in range(len(X)): + distances = dist(X[i], C) + cluster = np.argmin(distances) + clusters[i] = cluster + + # 保存旧质心 + C_old = deepcopy(C) + + # 计算新质心 + for i in range(k): + points = [X[j] for j in range(len(X)) if clusters[j] == i] + if points: + C[i] = np.mean(points, axis=0) + + # 计算质心移动距离 + error = dist(C, C_old, None) + +# 绘制最终聚类结果 +colors = ['r', 'g', 'b', 'c', 'm', 'y'] +fig, ax = plt.subplots() for i in range(k): - points = np.array([X[j] for j in range(len(X) if clusters[j] == i)]) - ax.scatter(points[:,0],points[:,1],s=7,c=colors[i]) -ax.scatter(C[:,0],C[:,1],marker="*",s=200,c='black') -plt.show() \ No newline at end of file + points = np.array([X[j] for j in range(len(X)) if clusters[j] == i]) + ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i]) +ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='black') +plt.title("Final Clustering Result") +plt.show() +