Ad Code

R Script for Correlation and Regression


# CORRELATION AND REGRESSION
library(readxl)
hsb2 <- read_excel("C:/Users/.xls")

# Show the class of every column as imported.
sapply(hsb2, class)

# Convert these five columns to factors in a single assignment.
hsb2[c("female", "race", "ses", "schtyp", "prog")] <- lapply(
  hsb2[c("female", "race", "ses", "schtyp", "prog")],
  factor
)

# Per-column summary after the conversion.
summary(hsb2)

# Keep only the numeric columns, then take positions 2 to 6 of those.
hsb3 <- hsb2[, vapply(hsb2, is.numeric, logical(1))][, 2:6]

# Correlation matrix of the selected columns.
cor(hsb3)

# Correlation between read and write with cor()'s default method.
cor(hsb2[["read"]], hsb2[["write"]])

# Normality check on write; the call is kept in `$` form because the
# printed result echoes the expression that was passed in.
shapiro.test(hsb2$write)

# Rank-based correlation; method can be "kendall" or "spearman".
cor(hsb2[["read"]], hsb2[["write"]], method = "spearman")

# Significance test of the correlation; method can also be
# "kendall" or "spearman".
cor.test(hsb2$read, hsb2$write, method = "pearson")

# Kendall correlation matrix for columns 7 to 11 of hsb2.
cor1 <- cor(hsb2[7:11], method = "kendall")

library(corrplot)

# Lower triangle with the coefficients printed as numbers.
# method: "circle", "square", "ellipse", "number", "shade", "color", "pie"
# type:   "upper", "lower", "full"
corrplot(cor1, type = "lower", method = "number")

# 20-step red-white-blue palette; upper triangle, variables ordered with
# order = "hclust".
col <- grDevices::colorRampPalette(c("red", "white", "blue"))(20)
corrplot(cor1, order = "hclust", type = "upper", col = col)

# For more options, refer
# http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram

# Quick look at the structure and class of the data.
str(hsb2)
class(hsb2)
hsb2$female

# splitting datasets
library(caTools)

nrow(hsb2)
names(hsb2)

# Fixed seed so the split is reproducible. The inspection calls above do
# not draw random numbers, so seeding right before the split is equivalent.
set.seed(12122)
is_train <- sample.split(hsb2$female, SplitRatio = 0.7)
train <- hsb2[is_train, ]
test <- hsb2[!is_train, ]
rm(is_train)

# Sizes of the two partitions.
nrow(train)
nrow(test)

# regression model
# Linear model of math on read and write.
# NOTE(review): the model is fitted on the full hsb2 data rather than on
# `train` from the split above; confirm this is intended.
model1 <- lm(formula = math ~ read + write, data = hsb2)
attributes(model1)
summary(model1)

# Store the model's prediction for every row of hsb2.
hsb2$pred <- predict(model1, newdata = hsb2)

str(test)
model1

# STEPWISE REGRESSION
# Stepwise selection by AIC starting from model1, the model fitted above.
# `fit` does not exist at this point in the script and MASS is not attached
# until the diagnostics section, so the call is namespace-qualified and
# uses model1.
step.model <- MASS::stepAIC(model1, direction = "both", trace = FALSE)
# options: "both" "forward" "backward"

# OTHER COMMANDS
# 99% confidence intervals for the coefficients of model1.
confint(model1, level = 0.99)

# Install QuantPsyc only when it is missing, so re-running the script does
# not reinstall the package (and hit the network) every time.
if (!requireNamespace("QuantPsyc", quietly = TRUE)) {
  install.packages("QuantPsyc")
}
library(QuantPsyc)
# Standardized (beta) coefficients of model1.
lm.beta(model1)

# Analysis-of-variance table for model1.
anova(model1)

# FINDING ACCURACY
# Root mean squared error: square each error, average, then take the root.
# (Squaring must happen inside mean(); squaring the mean error instead
# yields the absolute mean error, which is near zero for a least-squares
# fit with an intercept.)
rmse <- sqrt(mean((hsb2$pred - hsb2$math)^2))
# Mean absolute percentage error, expressed in percent.
mape <- mean(abs((hsb2$pred - hsb2$math) * 100 / hsb2$math))

rmse
mape

# Re-check column classes and make sure race and female are factors
# before running the group-wise tests below.
sapply(hsb2, class)
hsb2$race <- factor(hsb2$race)
hsb2$female <- factor(hsb2$female)

# REGRESSION DIAGNOSTICS
# Install car only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# Model used by all diagnostics that follow: math on read and write.
fit <- lm(math ~ read + write, data = hsb2)

# Homogeneity of Variance tests
# Each test compares the spread of math across the groups of a factor:
# female for var.test() and leveneTest(), race for fligner.test().
var.test(math ~ female, data = hsb2)
leveneTest(math ~ female, data = hsb2)
fligner.test(math ~ race, data = hsb2)

# Normality of residuals
# Quantile-quantile plot for the fitted model `fit`.
qqPlot(fit, main = "QQ Plot")

# Assessing outliers
# outlierTest() reports on extreme residuals of `fit`; the QQ plot is drawn
# again here (same call as above), followed by leverage plots.
outlierTest(fit)
qqPlot(fit, main = "QQ Plot")
leveragePlots(fit)

# Cook's D plot
# Threshold: 4 / (number of rows - number of coefficients - 2).
cutoff <- 4 / (nrow(hsb2) - length(coef(fit)) - 2)
# which = 4 selects the Cook's distance panel of plot() for lm objects.
plot(fit, which = 4, cook.levels = cutoff)

# Influence plot
# Point labelling is requested through the `id` argument; the older
# `id.method` argument is no longer part of car's interface and would be
# passed on to the plotting call as an unknown graphical parameter.
# method = "identify" means points are labelled interactively by clicking.
influencePlot(fit,
              id = list(method = "identify"),
              main = "Influence Plot",
              sub = "Circle size is proportional to Cook's Distance")

# Distribution of studentized residuals
library(MASS)
sresid <- studres(fit)

# Density-scaled histogram (freq = FALSE) so a density curve can be
# overlaid on the same axis.
hist(sresid, freq = FALSE,
     main = "Distribution of Studentized Residuals")

# Standard normal density evaluated on a 40-point grid spanning the
# residual range. `length.out` is spelled out in full rather than relying
# on partial matching of `length`.
xfit <- seq(min(sresid), max(sresid), length.out = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)

# Multicollinearity detection
# Variance inflation factors for the predictors of `fit`.
vif(fit)

# Autocorrelation
# acf() plots the autocorrelation of the residuals of `fit`; dwtest() is
# then run on the same model (presumably provided by lmtest, attached
# here; confirm).
library(lmtest)
acf(fit$residuals)
dwtest(fit)

# Evaluate homoscedasticity
# Test for non-constant error variance in `fit`.
ncvTest(fit)

Post a Comment

0 Comments

Close Menu