Вы находитесь на странице: 1из 20

# Commands

y=x\$name of column or variable
plot(x,y,type="l")
hist(y)
z=c(10,15,20)
Cluster
rownames(P)=P\$column name
m=dist(as.matrix(P))
hc=hclust(m)
plot(hc)
Scatterplot
extract minimum 2 variables to plot
scatterplot(X~Y,data=name of file,xlab=" ",ylab=" ",main=" ")
res=lm(X~Y)
res=signif(residuals(res),5)
res & enter
Pie chart
X=c()
Y=c()
pie(X,labels=Y)
Creating functions
myfun=function(x)sum(x)/length(x)
d=c(5,10,15,20)
myfun(d)
Linear regression
cor(x,y)
data=lm(yaxis~xaxis)
summary(data)
attributes(data)
data\$coef
abline(data)
confint(data,level=any%value)
anova(data)
Checking linear regression
after continuing above steps
plot(data)
par(mfrow=c(2,2))

Statistics
replace(x, list, values)
scrub(x, where, min, max, isvalue,newvalue)
x <- as.matrix()
x%in%y
all(x%in%y)
all(x)
max(x, na.rm=TRUE)
var(x, na.rm=TRUE)
sd(x, na.rm=TRUE)
fivenum(x, na.rm=TRUE)
table(x)
scale(data,scale=FALSE)
cumsum(x,na=rm=TRUE)
rev(x)
cor(x,y,use="pair")
aov(x~y,data=datafile)
aov.ex1 = aov(DV~IV,data=data.ex1)
aov.ex2 = aov(DV~IV1*IV21,data=data.ex2)
summary(aov.ex1)
print(model.tables(aov.ex1,"means"),digits=3)
boxplot(DV~IV,data=data.ex1)
lm(x~y,data=dataset)
t(X)
X %*% Y
solve(A)
solve(A,B)
Table
table(train\$Survived)
prop.table(table(train\$Survived))
table(<data_variable_1>, <data_variable_2>)
prop.table(table(train\$Child,train\$Survived),1)

## write.csv(submit, "./submit.csv", row.names=F)

tapply(variable1,var2,mean)
which.min/which.max
subset(filename,var1 >1000)
sd(variablename,na.rm=TRUE)
count()

for(i in 1:max){
+ file_name<-paste("result",i,sep = "")
+ file_name1=subset(Train,Train\$Group == i)
+ assign(file_name,file_name1)
+}
for(i in 1:max){filename= paste("A",i,sep = "")
try=eval(as.name(paste("result",i,sep = "")))
assign(filename,try)}

object=summary(filename)
write.csv(t(as.matrix(object)), file="name.csv")
colnames(data)[colnames(data)=="old_name"] <- "new_name"
paste0()
substr
Train\$columnname=NULL

Used for
For i/p of csv file
extracting variable or whole column
plotting graph of x and y, joined with lines (type="l"); use type="p" for points
Frequency distribution
assigning values to z

calculate distance between cluster elements
Create cluster depending upon dist.
plot cluster

## code for plotting scatterplot

for finding residual points
to store residual points
to view residuals

## get data frequency

get data names
to view pie chart

## func to divide sum by length

store numeric vector in d
calculate function value

pearson correlation
to fit linear regression
to check summary of linear regression
to see the names and class
to extract coefficient
to plot a line
to compute confidence intervals for the coefficients
to create anova

## remember to assign this to some object i.e., x <- replace(x,x==-9,NA)

combine different kinds of data into a data frame
converts a data frame to standardized scores
tests each element of x for membership in y
true if every element of x is in y (x is a subset of y)
for a vector of logical values, are they all true?
Find the maximum value in the vector x, exclude missing values
produces the variance covariance matrix
standard deviation
median absolute deviation
Tukey fivenumbers min, lowerhinge, median, upper hinge, max
frequency counts of entries, ideally the entries are factors (although it works with integers or even reals)
centers around the mean but does not scale by the sd
cumulative sum
reverse the order of values in x
correlation matrix for pairwise complete data, use="complete" for complete cases
where x and y can be matrices
do the analysis of variance or
do a two way analysis of variance
show the summary table
report the means and the number of subjects/cell
graphical summary appears in graphics window
basic linear model where x and y can be matrices
transpose of matrix X
matrix multiply X by Y
inverse of A
inverse of A times B, i.e. the solution X of A %*% X = B

## create a table with content of Survived

create a table with percentage with content of Survived
create a two-way table (matrix) from two variables
create a table(Matrices) with percentage with content of Survived

very imp

## to save statistical summary

to change column name
to concatenate strings
to select a specific substring
to remove certain Column

Commands
t.str <- strptime(Timeseriesmin\$TimeSeries, "%Y-%m-%d %H:%M:%S")
S.str <- as.numeric(format(t.str, "%H"))*60*60 + as.numeric(format(t.str, "%M"))*60+as.numeric
(format(t.str,"%S")
h.str <- as.numeric(format(t.str, "%H")) +
+
as.numeric(format(t.str, "%M"))/60
as.Date(Train\$DOB, "%d-%b-%Y")
data\$Transaction_Year <- format(data\$Transaction_Date, "%Y")
DateConvert = as.Date(strptime(mvt\$Date, "%m/%d/%y %H:%M"))

## # converting DOB to Date format

data\$DOB[nchar(data\$DOB) == 8] <- paste0("0", data\$DOB[nchar(data\$DOB) == 8])
data\$DOB <- paste0(substr(data\$DOB,1,7), "19", substr(data\$DOB,8,9))
data\$DOB <- as.Date(data\$DOB, "%d-%b-%Y")

## data\$Age <- as.numeric(as.Date("2016-01-01") - data\$DOB) / 365

Used for
Conversion into proper form
to convert time into secs
to convert time into hrs
to convert into date format
to extract year from date format
to extract date from timestamp

calculating age

Sequence no.

Name

Packages

Combining of dataset

Exploration of data

Data cleaning

Feature engineering

10

Model Building

11

## Storage of Submission files

Codes
library(data.table)
library(dplyr)
library(ggplot2)
library(randomForest)
library(caret)
library(dummies)

## path <- ""

setwd(path)
train<- read.csv("train.csv" , stringsAsFactors = F)

test\$Loan_Status<- "N"
combi<- rbind(train , test)

str(train)
summary()
Exploring categorical variables
table()
Plotting

## Conversion into factor

combi\$Gender<- as.factor(combi\$Gender)
Count of Missing&NA values
colSums(is.na(Loantrain))
colSums(LTrain=="")

## Imputing missing & NA values

LTrain\$Gender[is.na(LTrain\$Gender)]="Male"
levels(LTrain\$Gender)[levels(LTrain\$Gender)== ""]<- "Male"
combi\$Loan_Amount_Term[combi\$Loan_Amount_Term== -1]<- median(combi\$Loan_Amount_Term)
combi\$LoanAmount[combi\$LoanAmount== ""]<- mean(combi\$LoanAmount)

Timeseriesmin[complete.cases(Timeseriesmin), ]
combi\$ls<- with(combi , combi\$ApplicantIncome+ combi\$CoapplicantIncome)

## Replacing with a no.

data\$Gender <- ifelse(data\$Gender == "F", 1, 0)

## To find important variable

cor()
combi<- combi[,-c(1,2,4,5,6,7,8,9,10,12)]

## How to split data into train & test

split=sample.split(quality\$PoorCare,SplitRatio = 0.75)
qualitytrain=subset(quality,split==TRUE)
qualitytest=subset(quality,split==FALSE)
Separation of datasets
train1<- combi[1:nrow(train), ]
test1<- combi[-(1:nrow(train)), ]
Random_forest_Aman
Xgboost-RohanRao_seer

## Creation of submission files

sub_file <- data.frame(Loan_ID = test\$Loan_ID, Loan_Status = main_predict)
write.csv(sub_file, 'r_f2.csv')

Description

## Setting working directory

fastest way to load large dataset

## To calculate no. of Na values

To calculate no. of Blank values

to replace NA values
to replace blank values
to replace with median
to replace with mean

## to remove rows with NA/NaN values from dataset

creation of new variable

Click for code

onlinecode
onlinecode