Вы находитесь на странице: 1из 5

# Load the telecom data set; empty fields in the CSV are read in as NA.
Telecom <- read.csv(
  file = "C:/Users/Sarita/Desktop/dataprep1.csv",
  na.strings = ""
)

# Inspect the column names and the dimensions (rows, columns) of the data.
names(Telecom)
dim(Telecom)
# Q1. How many variables have missing values?
# Build a per-variable table of missing and non-missing counts.
# The counts are computed for every column in one vectorised pass, so the code
# no longer assumes that the file has exactly 16 columns.
# unname() keeps the default 1..n row names instead of the column names.
Missing_Values <- data.frame(
  Variable = names(Telecom),
  Missing_Count = unname(colSums(is.na(Telecom))),
  Non_Missing_Count = unname(colSums(!is.na(Telecom)))
)
# Number of variables that have at least one missing value
sum(Missing_Values$Missing_Count > 0)

Jigsaw Academy 2013

# Q2. If all missing values were deleted, what will be the total loss of records?
# Columns that must all be populated for a record to be retained.
Required_Columns <- c(
  "sbscrp_id", "minuse1", "minuse2", "minuse3", "minuse4", "Plan.Type",
  "prom2", "prom3", "prom4", "prom5", "svc_start_dt", "NEW_CELL_IND",
  "BIRTH_DT", "zip_code"
)
# Number of records with no missing value in any of the required columns
Non_Missing <- sum(complete.cases(Telecom[, Required_Columns]))
Non_Missing
# [1] 1974
# Records that would be lost if every row with a missing value were deleted
nrow(Telecom) - Non_Missing
# Note that we should not include missing values in svc_end_date because they are not actually
# missing data: a missing end date indicates an active customer.
# Q3. If you decide to update the missing values with the mean, what is the impact on the
# standard deviation of each variable?
# Minimal, since there are very few missing values.
# We will check with minuse2.
# Standard deviation of minuse2 before imputation (missing values ignored)
sd(Telecom$minuse2, na.rm = TRUE)
# Original mean of minuse2 without the missing values
Minuse2 <- mean(Telecom$minuse2, na.rm = TRUE)
Minuse2
# [1] 184.869
# Imputing the missing values in minuse2 with its mean
Telecom$minuse2[is.na(Telecom$minuse2)] <- Minuse2
# The mean itself is unchanged by mean imputation
mean(Telecom$minuse2, na.rm = TRUE)
# [1] 184.869
# Standard deviation after imputation, to compare with the value printed above
sd(Telecom$minuse2, na.rm = TRUE)
# Q4. If you decide to impute missing values, how could you potentially impute minuse4?
# Display every record whose minuse4 value is missing
subset(Telecom, is.na(minuse4))

Jigsaw Academy 2013

# Notice that 2 out of the 3 records are for a customer that has attrited.
# It may be a good idea to only impute data for the record that includes the non-attrited customer.
# Keep only the records where minuse4 is present
Check <- Telecom[!is.na(Telecom$minuse4), ]

# There are 1997 customers now in the Check data frame


# minuse3 and minuse4 are missing for this non-attrited customer
# We can impute both using customers who are similar to this non-attrited customer:
# minuse1 strictly between 120 and 150 and minuse2 strictly between 140 and 170.
# which() drops rows where the comparison is NA, so no all-NA rows are selected.
Check1 <- Telecom[which((Telecom$minuse1 > 120) & (Telecom$minuse1 < 150) &
  (Telecom$minuse2 > 140) & (Telecom$minuse2 < 170)), ]
# Mean usage of the similar customers; console output is kept as comments
mean(Check1$minuse3, na.rm = TRUE)
# [1] 241.375
mean(Check1$minuse4, na.rm = TRUE)
# [1] 274.25
# Now we can replace the missing minuse3 and minuse4 with the generated mean values.
# Q5. How many outliers can you identify for each variable?
# The variables with potential outliers are: minuse1 minuse4, and perhaps age.
# Summary statistics for columns 2 through 5 of the data
summary(Telecom[2:5])

Jigsaw Academy 2013

# summary() prints the basic summary of the data - Minimum, Maximum, Mean, Median etc.
# There are clearly two potential outliers/wrong data. These should be removed post
# investigation.
# We need to re-generate the summary and the frequency distributions after these observations
# are removed.
# Drop the two suspect observations: minuse2 equal to -55 and minuse4 equal to
# 177700. A logical filter is used instead of -which(...): when no row matches,
# which() returns an empty index and the negated empty index would discard
# every row. %in% treats NA as "no match", so rows with missing values are kept.
Telecom_New <- Telecom[!(Telecom$minuse2 %in% -55), ]
Telecom_New_1 <- Telecom_New[!(Telecom_New$minuse4 %in% 177700), ]
# Summary of columns 2 through 5 after the removals
summary(Telecom_New_1[, 2:5])

# Q6. If you replaced the top two extreme values by the mean for any one of the variables,
# how would the standard deviation change?
# Mean and Stdev including 177700 on minuse4
# (console output is kept as comments so the script stays runnable)
mean(Telecom$minuse4, na.rm = TRUE)
# [1] 299.5613
sd(Telecom$minuse4, na.rm = TRUE)
# [1] 3974.518
# Mean and Stdev excluding 177700 on minuse4
# A logical filter replaces -which(...), which would drop every row if no value
# matched; %in% treats NA as "no match", so rows with missing minuse4 are kept.
Telecom_min4 <- Telecom[!(Telecom$minuse4 %in% 177700), ]
mean(Telecom_min4$minuse4, na.rm = TRUE)
# [1] 210.6834
sd(Telecom_min4$minuse4, na.rm = TRUE)
Jigsaw Academy 2013

# [1] 148.0233
# Frequency table of minuse4, used to identify the largest values
table(Telecom$minuse4)
# 1133 and 177700 are the 2 top values
# Replacing the top 2 values with the mean (210.6834, the mean without 177700).
# %in% never yields NA, so missing minuse4 values are left untouched.
Telecom$minuse4[Telecom$minuse4 %in% c(1133, 177700)] <- 210.6834
# Mean and Stdev after the replacement
mean(Telecom$minuse4, na.rm = TRUE)
# [1] 210.2215
sd(Telecom$minuse4, na.rm = TRUE)
# [1] 146.5385

Jigsaw Academy 2013

Вам также может понравиться