
Name: Srikanth Mujjiga (srikanth.mujjiga@students.iiit.ac.in)
Roll No: 2015-50-831

Assignment #1: K Nearest Neighbor classifier


1. Introduction
I have used Python with the pandas, numpy and matplotlib packages to implement a KNN classifier.
Cross validation via N folds is also implemented.
Two distance functions are implemented: the Euclidean distance and the Hamming distance.
The input data had to be shuffled, as there is an inherent ordering to the available data sets.
The classifier is parameterized by the K value and the distance function to be used, as the usage sketch below illustrates.
o Signature: knn(trainX,trainY,predictX,k=1,df='ed')
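
For illustration, a minimal call to the classifier could look like the following sketch. The toy data frames here are invented purely for the example (they are not from any of the assignment data sets); knn itself is defined in the code in section 6.

import pandas as pd

#toy data: 4 observations, 2 features, 2 classes (assumed for illustration only)
trainX = pd.DataFrame([[1, 2], [2, 3], [8, 9], [9, 9]])
trainY = pd.DataFrame([1, 1, 2, 2])
predictX = pd.DataFrame([[2, 2]])
#predict the class of predictX from its 3 nearest neighbours under Euclidean distance
print knn(trainX, trainY, predictX, k=3, df='ed')   #prints 1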
Below are the functions and a corresponding short description:

euclidianDistance                  Calculates the Euclidean distance between two feature vectors
hammingDistance                    Calculates the Hamming distance between two feature vectors
kfoldvalidation                    An iterator which returns training set indices and validation set indices (see the fold sketch after this table)
knnWithNfolds                      Runs knn on the training set and predicts the class for the validation set
knn                                The KNN classifier
plotThem                           Helper function to plot the results
test                               The generic test function
test_1, test_2, test_3, test_4     One function for each of the four data sets; each of them loads, cleans up and shuffles the data
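
To make the fold generation concrete, the sketch below shows what kfoldvalidation (defined in section 6) yields for an assumed 10 observations and 3 folds; the last validation fold absorbs the remainder when the observation count is not an exact multiple of the fold count.

#10 observations and 3 folds: fold size is 10/3 = 3 (integer division),
#so the last validation fold gets the remaining 4 indices
for trainingIndices, validationIndices in kfoldvalidation(10, 3):
    print validationIndices
#[0 1 2]
#[3 4 5]
#[6 7 8 9]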

2. Data set #1: breast-cancer-wisconsin data

Url: http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
Features: 9 (all numerical, between 1 and 10)
Instances: 699 (some features missing)
Number of classes: 2
Best mean accuracy: 0.97
Distance function: Euclidean distance
Recommended K value (from observations): 3

3. Data set #2: iris data


Url: http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Features: 4 (all numerical)
Instances: 150 (no missing features)
Number of classes: 3
Best mean accuracy: 0.92
Distance function: Euclidean distance
Recommended K value (from observations): 2

4. Data set #3: wine data

Url: http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
Features: 13 (all numerical)
Instances: 178 (no missing features)
Number of classes: 3
Best mean accuracy: 0.74
Distance function: Euclidean distance
Recommended K value (from observations): 4

5. Data set #4: balance scale data

Url: http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
Features: 4 (categorical)
Instances: 625 (no missing features)
Number of classes: 3
Best mean accuracy: 0.78
Distance function: Euclidean distance and Hamming distance
Recommended K value (from observations): 2

Since the features are categorical, I initially used the Hamming distance. However, the Hamming distance returned poor mean accuracy, while running the KNN with the Euclidean distance returned much better accuracy. This is plausible because the feature values are ordered (1 to 5), so the magnitude of a mismatch carries information that the Hamming distance discards; the worked example below illustrates this.
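
As a small worked example (vectors invented for illustration, using the euclidianDistance and hammingDistance helpers from section 6):

import numpy as np

x1 = np.array([1, 5, 1, 5])
x2 = np.array([1, 4, 1, 4])   #off by one in two features
x3 = np.array([1, 1, 1, 1])   #off by four in the same two features
#Hamming distance cannot tell them apart: both differ from x1 in 2 positions
print hammingDistance(x1, x2), hammingDistance(x1, x3)       #2 2
#Euclidean distance ranks x2 much nearer to x1 than x3
print euclidianDistance(x1, x2), euclidianDistance(x1, x3)   #1.414... 5.656...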

6. Code
"""
Created on Tue Aug 18 21:30:26 2015
@author: Srikanth Mujjiga (smujjiga@outlook.com)
"""
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

os.chdir(r'G:\smujjiga\SM in AI\sminai')

def euclidianDistance(x1, x2):
    return np.linalg.norm(x1 - x2)

#identical vectors are at distance 0; each mismatched position adds 1
def hammingDistance(x1, x2):
    return sum([0 if a == b else 1 for a, b in zip(x1, x2)])
#n: observations, d: features
#trainX = (n x d), trainY = (n x 1), predictX = (1 x d)
def knn(trainX, trainY, predictX, k=1, df='ed'):
    predictx = np.array(predictX[0:1].values[0], dtype=long)
    #distance function: Hamming distance
    if df == 'hd':
        distance = trainX.apply(lambda x: hammingDistance(np.array(x, dtype=long), predictx), axis=1)
    #default: Euclidean distance
    else:
        distance = trainX.apply(lambda x: euclidianDistance(np.array(x, dtype=long), predictx), axis=1)
    #find the nearest k neighbours
    knearest = sorted(zip(distance, trainY.iloc[:, 0]))[:k]
    #group by class and return the class label of the largest group
    neighbours = pd.DataFrame(knearest, columns=['distance', 'class'])
    return neighbours['class'].value_counts().index.get_values()[0]
#yields (training fold indices, validation fold indices) pairs
def kfoldvalidation(observations, folds):
    foldSize = observations / folds
    allObservationIndices = np.arange(0, observations)
    #yield the first (folds - 1) folds
    for i in xrange(0, foldSize * (folds - 1), foldSize):
        validationIndices = np.arange(i, i + foldSize)
        trainingIndices = list(set(allObservationIndices) - set(validationIndices))
        yield trainingIndices, validationIndices
    #the last fold may have more than foldSize indices if observations
    #is not an exact multiple of folds
    validationIndices = np.arange(foldSize * (folds - 1), observations)
    trainingIndices = list(set(allObservationIndices) - set(validationIndices))
    yield trainingIndices, validationIndices
def knnWithNfolds(observationsX, observationsY, k, folds, df):
    totalObservations = len(observationsX)
    print "# observations:{0}, K: {1}, Folds: {2}".format(totalObservations, k, folds)
    currentFold = 0
    predictions = []
    for trainFold, validationFold in kfoldvalidation(totalObservations, folds):
        currentFold += 1
        print "knn of fold:{0} with k:{1}".format(currentFold, k)
        sys.stdout.flush()
        trainX = observationsX.ix[trainFold]
        trainY = observationsY.ix[trainFold]
        testX = observationsX.ix[validationFold]
        testY = observationsY.ix[validationFold]
        predictedY = []
        #for each validation sample get the predicted class (y)
        for i in xrange(0, len(testX)):
            predictedY.append(knn(trainX, trainY, testX[i:i+1], k, df))
        #score accuracy: 1 for a correct prediction, 0 for a wrong one
        predictions.extend([1 if a == b else 0 for a, b in
                            zip(predictedY, testY.iloc[:, 0].tolist())])
    meanAccuracy = np.mean(predictions)
    stdofAccuracy = np.std(predictions)
    return meanAccuracy, stdofAccuracy

#run knnWithNfolds for folds 2..5 and k 1..5; return the mean accuracy
#and the standard deviation of accuracy per fold count
def test(trainX, trainY, df):
    ma = {}
    st = {}
    for folds in xrange(2, 6):
        meanAccuracyPerK = []
        stdPerK = []
        for k in xrange(1, 6):
            meanAccuracy, stdofAccuracy = knnWithNfolds(trainX, trainY, k, folds, df)
            meanAccuracyPerK.append(meanAccuracy)
            stdPerK.append(stdofAccuracy)
        ma[folds] = meanAccuracyPerK
        st[folds] = stdPerK
    return ma, st

def test_1():
    data = pd.read_csv('datasets/breast-cancer-wisconsin.data', header=None)
    #shuffle the rows
    data = data.reindex(np.random.permutation(data.index))
    #clean the data: drop rows with missing ('?') features
    data.replace('?', np.nan, inplace=True)
    data.dropna(inplace=True)
    trainX = data[[1, 2, 3, 4, 5, 6, 7, 8, 9]]
    trainY = data[[10]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')

def test_2():
    data = pd.read_csv('datasets/iris.data', header=None)
    labels = {"Iris-setosa": 1,
              "Iris-versicolor": 2,
              "Iris-virginica": 3}
    #shuffle the rows (a single permutation is sufficient)
    data = data.reindex(np.random.permutation(data.index))
    #map the class names to numeric labels
    data[5] = [labels[i] for i in data[4]]
    trainX = data[[0, 1, 2, 3]]
    trainY = data[[5]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')
def test_3():
    data = pd.read_csv('datasets/wine.data', header=None)
    #shuffle the rows
    data = data.reindex(np.random.permutation(data.index))
    #columns 1..13 are the features, column 0 is the class label
    trainX = data[list(np.arange(1, 14))]
    trainY = data[[0]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')
def test_4():
    data = pd.read_csv('datasets/balance-scale.data', header=None)
    #shuffle the rows
    data = data.reindex(np.random.permutation(data.index))
    #columns 1..4 are the features, column 0 is the class label
    trainX = data[[1, 2, 3, 4]]
    trainY = data[[0]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'hd')
def plotThem(median, std, title):
    f, ax = plt.subplots(2, 2)
    #plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=.3, hspace=.3)
    plt.suptitle(title)
    #one subplot per fold count: mean accuracy vs. k with error bars
    x = np.arange(1, len(median[2]) + 1)
    ax[0, 0].errorbar(x, median[2], yerr=std[2], fmt='-o', clip_on=False)
    ax[0, 0].set_title('2 Fold')
    ax[0, 0].margins(0.1, 0.1)
    ax[0, 1].errorbar(x, median[3], yerr=std[3], fmt='-o')
    ax[0, 1].set_title('3 Fold')
    ax[0, 1].margins(0.1, 0.1)
    ax[1, 0].errorbar(x, median[4], yerr=std[4], fmt='-o')
    ax[1, 0].set_title('4 Fold')
    ax[1, 0].margins(0.1, 0.1)
    ax[1, 1].errorbar(x, median[5], yerr=std[5], fmt='-o')
    ax[1, 1].set_title('5 Fold')
    ax[1, 1].margins(0.1, 0.1)
    plt.setp([a.get_xticklabels() for a in ax[0, :]], visible=False)
    plt.setp([a.get_yticklabels() for a in ax[:, 1]], visible=False)

m1, s1 = test_1()
m2, s2 = test_2()
m3, s3 = test_3()
m4, s4 = test_4()

plotThem(m1, s1, "breast-cancer-wisconsin")
plotThem(m2, s2, "iris data")
plotThem(m3, s3, "wine.data")
plotThem(m4, s4, "balance scale data")
