Вы находитесь на странице: 1из 8

Clustering_Rcode.R
User02

Tue Mar 20 14:28:48 2018


# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this analysis.
rm(list = ls())

# Point the session at the folder that holds cluster.csv. The path has to be
# one uninterrupted string: a line break inside the quotes becomes part of the
# directory name and the folder is then not found.
# NOTE(review): assumes the wrapped line break stood for a single space
# ("Business Analytics Video") - confirm against the real folder name.
setwd(file.path(
  "C:/Users/User02/Google Drive/Business Analytics",
  "Business Analytics Video/Moodle Upload/10. Clustering",
  "Clustering Case on R"
))

## Read the data from the file
cust_data <- read.csv(file = "cluster.csv")

### Preview the first rows before selecting the columns required for clustering
head(cust_data, n = 6L)

## Visit_ID Avg_Session_Duration Pages_Per_Session Channel Age Gender


## 1 100001 17 6 1 26 0
## 2 100002 7 4 1 30 1
## 3 100003 17 4 1 33 1
## 4 100004 9 3 1 27 0
## 5 100005 17 4 1 34 1
## 6 100006 8 6 1 37 0
## Transaction
## 1 14833
## 2 13189
## 3 15459
## 4 9857
## 5 7985
## 6 15503

# Drop the first column (Visit_ID in the preview above): it is a row
# identifier, not a feature to cluster on.
cust_data <- cust_data[-1L]

# Five-number summary and mean of every remaining column.
summary(cust_data)

## Avg_Session_Duration Pages_Per_Session Channel Age


## Min. : 0.00 Min. :1.000 Min. :0.000 Min. :18.00
## 1st Qu.: 3.00 1st Qu.:3.000 1st Qu.:0.000 1st Qu.:27.00
## Median : 8.00 Median :4.000 Median :1.000 Median :32.00
## Mean : 8.41 Mean :3.678 Mean :0.693 Mean :33.74
## 3rd Qu.:14.00 3rd Qu.:5.000 3rd Qu.:1.000 3rd Qu.:39.00
## Max. :19.00 Max. :6.000 Max. :1.000 Max. :58.00
## Gender Transaction
## Min. :0.000 Min. : 2015
## 1st Qu.:0.000 1st Qu.: 4159
## Median :1.000 Median : 6376
## Mean :0.527 Mean : 7857
## 3rd Qu.:1.000 3rd Qu.:11722
## Max. :1.000 Max. :16962
# Standardise every column (centre on the mean, divide by the standard
# deviation) so that no single variable dominates the distance calculations.
cust_data_f <- scale(cust_data, center = TRUE, scale = TRUE)

head(cust_data_f, n = 6L)

## Avg_Session_Duration Pages_Per_Session Channel Age


## [1,] 1.39253043 1.5768270 0.6652507 -0.79388819
## [2,] -0.22857601 0.2186642 0.6652507 -0.38366306
## [3,] 1.39253043 0.2186642 0.6652507 -0.07599421
## [4,] 0.09564528 -0.4604172 0.6652507 -0.69133191
## [5,] 1.39253043 0.2186642 0.6652507 0.02656208
## [6,] -0.06646536 1.5768270 0.6652507 0.33423093
## Gender Transaction
## [1,] -1.0550122 1.58332777
## [2,] 0.9469085 1.21018406
## [3,] 0.9469085 1.72541291
## [4,] -1.0550122 0.45390981
## [5,] 0.9469085 0.02901624
## [6,] -1.0550122 1.73539972

# Pairwise Euclidean distances between the standardised observations.
dist.res <- dist(cust_data_f, method = "euclidean")

# Agglomerative hierarchical clustering with complete linkage.
hc <- hclust(dist.res, method = "complete")

# Visualise the dendrogram; observation labels are suppressed and hang = -1
# draws all leaves down to the baseline.
plot(hc, labels = FALSE, hang = -1)

# Outline the three-cluster cut with one border colour per cluster. A shorter
# colour vector is recycled, so 2:3 would draw two of the three rectangles in
# the same colour.
rect.hclust(hc, k = 3, border = 2:4)

####K-means clustering

#install.packages("vegan")
#install.packages("permute")

library(vegan)

## Warning: package 'vegan' was built under R version 3.4.4

## Loading required package: permute

## Warning: package 'permute' was built under R version 3.4.4

## Loading required package: lattice

## This is vegan 2.4-6


library(permute)
library(lattice)
# Run a cascade of k-means partitions, from 1 group up to 10 groups, on the
# standardised data, with 1000 random starts per partition size.
fit <- cascadeKM(
  scale(cust_data, center = TRUE, scale = TRUE),
  inf.gr = 1,
  sup.gr = 10,
  iter = 1000
)
plot(fit, sortg = TRUE, grpmts.plot = TRUE)

# Position of the largest value in the second row of the results table.
# NOTE(review): this position equals the number of clusters only because the
# cascade starts at 1 group - revisit if inf.gr is ever changed.
calinski.best <- as.numeric(which.max(fit$results[2, ]))
cat("Calinski criterion optimal number of clusters:", calinski.best, "\n")

## Calinski criterion optimal number of clusters: 2

# Also looking at the elbow chart


# Build the elbow chart on the same standardised features that the Calinski
# run and the final k-means use. On the raw columns, Transaction (values in the
# thousands) would swamp the 0/1 and single-digit variables, so the curve would
# reflect Transaction almost exclusively.
mydata <- scale(cust_data)

# Determine the optimal cluster size based on the within-cluster sum of squares.
# For k = 1 there is a closed form: (n - 1) times the sum of column variances.
wss <- numeric(15)
wss[1] <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))

# For k = 2..15, take the total within-cluster sum of squares from k-means.
for (i in 2:15) {
  wss[i] <- kmeans(mydata, centers = i)$tot.withinss
}

# Plot the elbow chart to determine the optimal number of clusters.
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)
# From the elbow chart it looks like 4 clusters, although the change is small
# after 2. So we can consider either 2 or 4. Let's stick with 2, as the
# Calinski criterion also suggests that.

### Run the k-means algorithm to generate the clusters
# k-means starts from randomly chosen centres. Fixing the seed makes the
# cluster labels and sizes repeatable between runs, and nstart keeps the best
# of several random starts instead of trusting a single initialisation that
# may stop in a poor local optimum.
set.seed(123)
k1 <- kmeans(cust_data_f, centers = 2, nstart = 25)

k1

## K-means clustering with 2 clusters of sizes 414, 586


##
## Cluster means:
## Avg_Session_Duration Pages_Per_Session Channel Age
## 1 -0.9995805 -0.7671520 -0.4548586 0.13308189
## 2 0.7061883 0.5419811 0.3213506 -0.09402031
## Gender Transaction
## 1 -0.04921630 -0.8983283
## 2 0.03477056 0.6346551
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [35] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [69] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [103] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [137] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [171] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
## [205] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [239] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [273] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [307] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2
## [341] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [375] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [409] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [443] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [477] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
## [511] 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1
## [545] 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [579] 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
## [613] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [647] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [681] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [715] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [749] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [783] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [817] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [851] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [885] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [919] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [953] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [987] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2086.813 2054.992
## (between_SS / total_SS = 30.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"

###See the clustering results


### Group means (cluster centres) for each variable, in standardised units
k1[["centers"]]

## Avg_Session_Duration Pages_Per_Session Channel Age


## 1 -0.9995805 -0.7671520 -0.4548586 0.13308189
## 2 0.7061883 0.5419811 0.3213506 -0.09402031
## Gender Transaction
## 1 -0.04921630 -0.8983283
## 2 0.03477056 0.6346551

### Number of observations in each cluster


k1[["size"]]

## [1] 414 586


### Cluster assignment of each observation
k1[["cluster"]]

## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [35] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [69] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [103] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [137] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [171] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
## [205] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [239] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [273] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [307] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2
## [341] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [375] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [409] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [443] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [477] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
## [511] 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1
## [545] 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [579] 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
## [613] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [647] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [681] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [715] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [749] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [783] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [817] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [851] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [885] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [919] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [953] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [987] 1 1 1 1 1 1 1 1 1 1 1 1 1 1

# Attach each observation's k-means cluster label to the customer data.
cust_data[["cluster"]] <- k1[["cluster"]]

# Open the augmented data in the spreadsheet-style viewer.
View(cust_data)

# Silhouette plot for checking how good the clusters are


library(cluster)

# Pairwise dissimilarities between the standardised observations.
diss <- daisy(cust_data_f)

## Warning in daisy(cust_data_f): binary variable(s) 3, 5 treated as interval


## scaled

# Silhouette widths of the k-means labels, measured on the dissimilarities.
sp <- silhouette(cust_data$cluster, diss)

# Open a fresh graphics device for the plot. dev.new() is available on every
# platform, whereas windows() only exists in R for Windows and stops the script
# elsewhere.
dev.new()
plot(sp)

# Analysing the clusters: mean of every variable per cluster, computed on the
# unscaled customer data.
aggregate(. ~ cluster, data = cust_data, FUN = mean)

## cluster Avg_Session_Duration Pages_Per_Session Channel Age


## 1 1 2.243961 2.548309 0.4830918 35.03865
## 2 2 12.766212 4.476109 0.8412969 32.82423
## Gender Transaction
## 1 0.5024155 3899.297
## 2 0.5443686 10653.329

Вам также может понравиться