# Page 1 of 5 (navigation text left over from the document viewer; it is not
# part of the script)

# rm(list=ls(all=T))

# NOTE(review): `mydata` is not created anywhere in the visible script; it is
# assumed to be loaded before this point -- confirm how it is imported.
# Print the data set for a first look.
mydata

# Data exploration ----
# The given data is from the insurance industry.

# Number of rows and columns.
dim(mydata)

# In the given data set all the variables are numeric.

# The committee is interested to know each field of the data collected,
# through descriptive analysis, to gain basic insights into the data set and
# to prepare for further analysis.
summary(mydata)

# Split the data into a training set and a testing set ----
# The split ratio is 70:30.

library(caTools)

# Split the data on the basis of the dependent variable (Payment):
# 70% for training and 30% for testing. The seed makes the split reproducible.
set.seed(125)
# Logical vector, one entry per row of mydata: TRUE = training, FALSE = testing.
# Named split_mask (not `sample`) so that base::sample() is not masked.
split_mask <- sample.split(mydata$Payment, SplitRatio = 0.70)
split_mask

# Training data set
train_data <- subset(mydata, split_mask == TRUE)
train_data

# Testing data set
test_data <- subset(mydata, split_mask == FALSE)
test_data

# Model 1: Payment explained by all other variables ----
# NOTE(review): the statement that fits model_1 is missing from the script as
# received. It is reconstructed here from the commentary below (which discusses
# Bonus, Make, Kilometres, Zone, Insured and Claims as predictors) and from
# model_1a, which is fitted on train_data -- confirm against the original.
model_1 <- lm(Payment ~ ., data = train_data)

summary(model_1)

# The p-values for model_1 are in column 4 under "Coefficients".

# Except for Bonus and Make, the p-values of all the independent variables are
# less than 0.05, so we can say that, except Bonus and Make, all other
# independent variables are significant.
# Insured and Claims are the most significant variables, as their p-values are
# very small.
# The slopes are: Kilometres = 4.671e+03, Zone = 2.825e+03,
# Insured = 2.916e+01, Claims = 4.319e+03.

# Correlation matrix of all variables except columns 3 and 4.
# NOTE(review): columns 3 and 4 are presumably Bonus and Make -- confirm.
cor(mydata[, -c(3, 4)])

# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.

# Both multiple R-squared and adjusted R-squared are the same for model_1:
# 0.9947.

# Since we have more than one significant variable, we will fit a new model
# with only the significant variables as independent variables.

# Model 1a: Payment explained by the significant variables only ----
model_1a <- lm(Payment ~ Kilometres + Zone + Insured + Claims,
  data = train_data
)

summary(model_1a)

# The p-values for model_1a are in column 4 under "Coefficients".

# The p-values of all the independent variables are less than 0.05, so we can
# say that all the independent variables are significant.
# Insured and Claims are the most significant variables, as their p-values are
# very small.
# The slopes are: Kilometres = 4.625e+03, Zone = 2.782e+03,
# Insured = 2.948e+01, Claims = 4.310e+03.

# Correlation matrix of all variables except columns 3 and 4.
cor(mydata[, -c(3, 4)])

# Insured and Claims show a strong positive correlation with the dependent
# variable, whereas Kilometres and Zone show a weak negative correlation with
# Payment.

# Both multiple R-squared and adjusted R-squared are the same: 0.9947.
# NOTE(review): the original comment names model_1 here, repeating the text of
# the previous section -- confirm the value against summary(model_1a).

# So model_1a is the final model, as all the independent variables are
# significant, i.e. Kilometres, Zone, Insured and Claims all have a
# significant effect on the Payment.

# Visualize the results for better understanding ----
# Draw two scatter plots side by side. par() returns the previous settings,
# which are restored once both plots are drawn.
old_par <- par(mfrow = c(1, 2))

# Payment against the number of claims.
plot(mydata$Claims, mydata$Payment,
  xlab = "Number of Claims", ylab = "Payments", main = ""
)

# Payment against the number of insured policy-years.
plot(mydata$Insured, mydata$Payment,
  xlab = "The number of insured in policy-years", ylab = "Payments", main = ""
)

par(old_par)

# Prediction on the testing data set ----
predtest <- predict(model_1a, newdata = test_data)

# Transform the predictions into a data frame.
pred_payment <- data.frame(predtest)

# Bind the predicted values to the data set with cbind().
# NOTE(review): the cbind() statement is missing from the script as received.
# It is reconstructed with test_data, because the predictions were made from
# test_data and so have one row per test_data row -- confirm against the
# original.
final_mydata <- cbind(test_data, pred_payment)

# Export the final file with the predicted values.
write.csv(final_mydata, "InsuranceFinal.csv")

# The insurance company is planning to establish a new branch office, so they
# are interested to find at what location, kilometre and bonus level their
# insured amount, claims and payment get increased.

library(dplyr)

# Mean of selected columns of `data` within each level of `group`.
#
# data  : data frame holding the columns to average.
# group : vector with one entry per row of `data`, defining the groups.
# cols  : columns of `data` to average; the default c(5, 6, 7) is what the
#         original script used (presumably Insured, Claims and Payment --
#         confirm against the column order of mydata).
#
# Returns the result of apply(): one row per group and one column per selected
# variable. apply() converts the data frame to a matrix first, which relies on
# the selected columns being numeric.
group_means <- function(data, group, cols = c(5, 6, 7)) {
  apply(data[, cols], 2, function(x) tapply(x, group, mean))
}

# Group the data by Zone and compare the means of the different zones.
grupzone <- group_means(mydata, mydata$Zone)
grupzone

# Zone 4 has the highest number of claims, and thus payment as well.
# Zones 1-4 have more insured years, claims, and payments.

# Group the data by Kilometres and compare the means of the different
# kilometre groups.
grupkil <- group_means(mydata, mydata$Kilometres)
grupkil

# Kilometre group 2 has the maximum payments.
# Though the insured number of years is lower than for kilometre group 1, the
# claims and payments are higher for group 2.

# Group the data by Bonus and compare the means of the different bonus levels.
grupbon <- group_means(mydata, mydata$Bonus)
grupbon

# Bonus group 7 has the maximum number of claims and Payment.

# Model 2: what affects the claim rates ----
# The committee wants to understand what affects their claim rates so as to
# decide the right premiums for a certain set of situations. Hence, they need
# to find whether the insured amount, zone, kilometre, bonus, or make affects
# the claim rates and to what extent.

# Dependent variable: Claims.
# Independent variables: kilometres, zone, bonus, make and insured.
# Payment is excluded explicitly: `Claims ~ .` would also use Payment as a
# predictor, which is not among the independent variables listed above.
model_2 <- lm(Claims ~ . - Payment, data = mydata)
summary(model_2)

# The results provide the intercept and the estimated coefficients, and these
# in turn show that the p-values of all the independent variables (kilometres,
# zone, bonus, make and insured) are highly significant and are making an
# impact on the claims.
# NOTE(review): re-check this conclusion against the summary() output now that
# Payment is no longer among the predictors.