更新 k-means2

This commit is contained in:
yky 2025-06-10 23:04:46 +08:00
parent 1eaa6c38b0
commit 886ac03f75

View File

@ -1,13 +1,34 @@
from copy import deepcopy
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16,9) from copy import deepcopy
# 设置图形样式
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot') plt.style.use('ggplot')
# Create sample data and save it as a CSV file
def create_sample_data(path='xclara.csv', seed=42, n_per_cluster=100):
    """Write a synthetic three-cluster 2-D data set to a CSV file.

    Three Gaussian blobs are drawn around (0, 0), (10, 5) and (5, 10) with
    standard deviations 1, 1.5 and 1.2, stacked in that order, and saved
    with the column names ``V1`` and ``V2`` (no index column).

    Args:
        path: Destination CSV file. Defaults to ``'xclara.csv'``.
        seed: Value passed to ``np.random.seed``. Defaults to 42.
        n_per_cluster: Number of points drawn for each of the three blobs.

    Returns:
        None. The data set is only written to ``path``.
    """
    # The *global* NumPy RNG is seeded on purpose: code that runs after this
    # call and draws from ``np.random`` stays reproducible as well.
    np.random.seed(seed)

    # (centre, standard deviation) of each blob, in generation order.
    cluster_specs = (
        ([0, 0], 1),
        ([10, 5], 1.5),
        ([5, 10], 1.2),
    )
    data = np.vstack([
        np.random.normal(loc=center, scale=spread, size=(n_per_cluster, 2))
        for center, spread in cluster_specs
    ])

    # Build the DataFrame and save it as CSV
    df = pd.DataFrame(data, columns=['V1', 'V2'])
    df.to_csv(path, index=False)
# 创建示例CSV文件
create_sample_data()
# 从CSV读取数据
data = pd.read_csv('xclara.csv') data = pd.read_csv('xclara.csv')
f1 = data['V1'].values f1 = data['V1'].values
f2 = data['V2'].values f2 = data['V2'].values
X = np.array(list(zip(f1, f2))) X = np.array(list(zip(f1, f2)))
# Distance function
def dist(a, b, ax=1):
    """Return the Euclidean (L2) norm of ``a - b``.

    Args:
        a: Array-like of coordinates.
        b: Array-like that broadcasts against ``a``.
        ax: Axis passed to ``np.linalg.norm``. The default ``1`` gives one
            distance per row of a 2-D difference; ``None`` reduces the whole
            difference to a single scalar.

    Returns:
        The norm of ``a - b`` along ``ax``: an array, or a scalar when the
        reduction leaves no axes.
    """
    # asarray lets plain lists/tuples be passed as well as ndarrays.
    return np.linalg.norm(np.asarray(a) - np.asarray(b), axis=ax)
@ -16,33 +37,49 @@ def dist(a, b, ax=1):
k = 3 k = 3
# 随机初始化质心(修正:使用数据范围) # 随机初始化质心(修正:使用数据范围)
C_x = np.random.randint(0,np.max(X)-20, size=k) C_x = np.random.uniform(np.min(f1), np.max(f1), size=k)
C_y = np.random.randint(0,np.max(X)-20, size=k) C_y = np.random.uniform(np.min(f2), np.max(f2), size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32) C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
C_old = np.zeros(C.shape) # 绘制初始数据点和质心(修正颜色拼写错误)
print(C) plt.scatter(f1, f2, c='black', s=7) # 修正:'balck' -> 'black'
clusters = np.zeros(len(X)) plt.scatter(C_x, C_y, marker='*', s=200, c='red')
iteration_flag = dist(C,C_old,1) plt.title("Initial Data Points and Centroids")
tmp = 1
while iteration_flag.any() != 0 and tmp<20:
for i in range(len(X)):
distances = dist(X[i],C,1)
clusters[i] = clusters
C_old = deepcopy(C)
for i in range(C):
points = [X[j] for j in range(len(X)) if clusters[j] == i]
C[i] = np.mean(points,axis=0)
print('%d'%tmp)
tmp = tmp + 1
iteraction_flag = dist(C,C_old,1)
print('distance:',iteraction_flag)
colors = ['r','g','b','y','c','m']
fig,ax = plt.subplots()
for i in range(k):
points = np.array([X[j] for j in range(len(X) if clusters[j] == i)])
ax.scatter(points[:,0],points[:,1],s=7,c=colors[i])
ax.scatter(C[:,0],C[:,1],marker="*",s=200,c='black')
plt.show() plt.show()
# ---- Optional: full K-Means implementation ----
# Working state: the previous centroids, one cluster label per point, and
# the overall distance the centroids moved during the last update.
C_old = np.zeros(C.shape)
clusters = np.zeros(len(X))
error = dist(C, C_old, None)

# Upper bound on the number of update rounds so the loop always terminates,
# even if the exact-zero movement test below were never satisfied.
max_iter = 300
n_iter = 0

# K-Means iterations: repeat until the centroids stop moving
while error != 0 and n_iter < max_iter:
    # Assign every point to its nearest centroid. Broadcasting builds an
    # (n_points, k) distance table in one call; argmin picks the first
    # closest centroid, exactly as a per-point argmin would.
    clusters[:] = np.argmin(
        dist(X[:, np.newaxis, :], C[np.newaxis, :, :], 2), axis=1
    )

    # Keep the old centroids to measure how far they move
    C_old = deepcopy(C)

    # Move each centroid to the mean of its members; a centroid that
    # attracted no points keeps its previous position.
    for i in range(k):
        members = X[clusters == i]
        if len(members):
            C[i] = members.mean(axis=0)

    # Distance the centroids moved in this round
    error = dist(C, C_old, None)
    n_iter += 1
# Plot the final clustering result
colors = ['r', 'g', 'b', 'c', 'm', 'y']
fig, ax = plt.subplots()
for i in range(k):
    # Boolean-mask selection always yields a 2-D array, so the column
    # indexing below is safe even when a cluster ended up empty.
    points = X[clusters == i]
    # Wrap around the palette so more clusters than colours cannot overflow.
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i % len(colors)])
# Final centroids drawn as large black stars on top of the points
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='black')
plt.title("Final Clustering Result")
plt.show()