# Academic Documents
# Professional Documents
# Culture Documents
import math
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# K-means clustering (k = 2) of the (User_ID, Purchase) points.
#
# `data` and `finaltable` are defined outside this section; they are used
# here like pandas DataFrames with 'User_ID' and 'Purchase' columns.
#
# Names left at module level for later code: k, centroids, l, meanpurchase,
# sd, finalsizec1, finalsizec2, change, clustero, clustert, data, data2,
# x, y, x1, y1.
k = 2
centroids = {}  # cluster number -> (User_ID coordinate, Purchase coordinate)
l = []          # l[c] collects the points currently assigned to centroid c

# Seed every centroid at a random integer position inside the bounding box
# of the data. Series.min()/.max() replace the removed `pd.np` alias.
user_lo, user_hi = data['User_ID'].min(), data['User_ID'].max()
purchase_lo, purchase_hi = data['Purchase'].min(), data['Purchase'].max()
for i in range(k):
    centroids[i] = (
        int(random.uniform(user_lo, user_hi)),
        int(random.uniform(purchase_lo, purchase_hi)),
    )
    l.append([])

meanpurchase = data['Purchase'].mean()
sd = data['Purchase'].std()
finalsizec1 = 0
finalsizec2 = 0

print(centroids[0])
print(centroids[1])

# The rows being clustered do not change between iterations, so read them
# once instead of on every pass.
# NOTE(review): rows are selected through finaltable's index but read from
# `data`, exactly as before -- confirm both frames share the same index.
points = [
    (data["User_ID"][j], data["Purchase"][j])
    for j in finaltable['User_ID'].index
]

change = True
while change:
    # Assignment step: attach every point to its nearest centroid. Ties go
    # to the lowest-numbered centroid because list.index returns the first
    # minimum.
    for point in points:
        best = [
            math.hypot(point[0] - centroids[t][0], point[1] - centroids[t][1])
            for t in range(len(centroids))
        ]
        l[best.index(min(best))].append(point)

    # Only the first two clusters are plotted, as in the original script.
    clustero = list(l[0])
    clustert = list(l[1])

    # Update step: move each centroid to the mean of its members and record
    # whether any centroid actually moved.
    moved = False
    for z in range(k):
        members = l[z]
        if not members:
            # An empty cluster has no mean; keep its centroid where it is
            # rather than dividing by zero.
            continue
        first, secound = centroids[z]
        mean_user = sum(p[0] for p in members) / float(len(members))
        mean_purchase = sum(p[1] for p in members) / float(len(members))
        if first != mean_user or secound != mean_purchase:
            centroids[z] = (mean_user, mean_purchase)
            moved = True

    # Stop only when *no* centroid changed in this pass.
    if not moved:
        # NOTE: `data` is rebound to the first cluster's points here, as in
        # the original script; the DataFrame is not needed after this.
        # reshape(-1, 2) keeps the unpacking below valid for an empty cluster.
        data = np.array(clustero).reshape(-1, 2)
        data2 = np.array(clustert).reshape(-1, 2)
        finalsizec1 = len(data)
        finalsizec2 = len(data2)
        x, y = data.T
        x1, y1 = data2.T
        plt.scatter(x, y)
        plt.scatter(x1, y1)
        # Save before showing: once the window opened by show() is closed
        # the figure is gone and savefig() would write an empty page.
        plt.savefig("kmeans.pdf")
        plt.show()
        change = False

    # Reset the per-cluster buckets for the next pass.
    l.clear()
    for _ in range(k):
        l.append([])