# Visualizing Data For Regression
## Read the raw automobile price CSV file and return a cleaned data frame.
##
## Arguments:
##   file  Path to a CSV file with a header row. It must contain the columns
##         'price', 'bore', 'stroke', 'horsepower' and 'peak.rpm'.
##
## Behaviour:
##   * If a variable named `numcols` is visible from the calling environment
##     chain (e.g. a global) and is populated, it names the columns that must
##     be complete; otherwise the five default columns above are used.
##   * In the default columns (plus any character `numcols`), '?' entries are
##     replaced by NA and the column is coerced to numeric. Each replaced or
##     already-missing entry is reported with print().
##   * Rows with an NA in any `numcols` column are dropped.
##   * The 'symboling' and 'normalized.losses' columns are removed.
##
## Returns the cleaned data frame.
read.auto <- function(file = 'Automobile price data _Raw_.csv') {
  ## Read the csv file
  auto.price <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)

  default.numcols <- c('price', 'bore', 'stroke', 'horsepower', 'peak.rpm')

  ## Pick up an externally defined `numcols` if there is one. The checks are
  ## all scalar, so a vector-valued `numcols` no longer breaks `||`
  ## (an error since R 4.3).
  outer.numcols <- if (exists("numcols")) get("numcols") else NULL
  use.defaults <- is.null(outer.numcols) ||
    length(outer.numcols) == 0 ||
    is.na(outer.numcols[1])
  numcols <- if (use.defaults) default.numcols else outer.numcols
  print(c("numeric columns:", numcols))

  ## Always clean the default columns; also clean any extra columns named in
  ## `numcols` so that the complete-case filter below sees real NAs for them.
  clean.cols <- default.numcols
  if (is.character(numcols)) {
    clean.cols <- union(default.numcols, numcols)
  }
  absent <- setdiff(clean.cols, colnames(auto.price))
  if (length(absent) > 0) {
    stop("columns not found in ", file, ": ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  for (col in clean.cols) {
    values <- auto.price[[col]]
    ## Entries that are '?' or already missing
    missing.idx <- which(is.na(values) | values %in% '?')
    ## Convert ? to NA
    values[missing.idx] <- NA
    for (idx in missing.idx) {
      print(c(col, idx, NA))
    }
    ## Coerce some character columns to numeric
    auto.price[[col]] <- as.numeric(values)
  }

  ## Remove cases or rows with missing values.
  ## Keep the rows which do not have NAs.
  keep <- complete.cases(auto.price[, numcols, drop = FALSE])
  auto.price <- auto.price[keep, ]

  ## Drop some unneeded columns
  auto.price[['symboling']] <- NULL
  auto.price[['normalized.losses']] <- NULL
  auto.price
}
## Categorical columns, used as the x variable of the box and violin plots.
cat_cols <- c(
  'fuel.type',
  'aspiration',
  'num.of.doors',
  'body.style',
  'drive.wheels',
  'engine.location',
  'engine.type',
  'num.of.cylinders'
)
## Columns passed to the density, scatter, hexbin and pairs plots.
plotcols <- c(
  'price',
  'city.mpg',
  'curb.weight',
  'engine.size',
  'horsepower',
  'fuel.type'
)
## Load and clean the data set using the default file name.
auto_prices <- read.auto()
## [1] "numeric columns:" "price" "bore" "stroke"
## [5] "horsepower" "peak.rpm"
## [1] "price" "10" NA
## [1] "price" "45" NA
## [1] "price" "46" NA
## [1] "price" "130" NA
## [1] "bore" "56" NA
## [1] "bore" "57" NA
## [1] "bore" "58" NA
## [1] "bore" "59" NA
## [1] "stroke" "56" NA
## [1] "stroke" "57" NA
## [1] "stroke" "58" NA
## [1] "stroke" "59" NA
## [1] "horsepower" "131" NA
## [1] "horsepower" "132" NA
## [1] "peak.rpm" "131" NA
## [1] "peak.rpm" "132" NA
## List every column that remains in the cleaned data frame.
print("all auto columns:")
## [1] "all auto columns:"
colnames(auto_prices)
## [1] "make" "fuel.type" "aspiration"
## [4] "num.of.doors" "body.style" "drive.wheels"
## [7] "engine.location" "wheel.base" "length"
## [10] "width" "height" "curb.weight"
## [13] "engine.type" "num.of.cylinders" "engine.size"
## [16] "fuel.system" "bore" "stroke"
## [19] "compression.ratio" "horsepower" "peak.rpm"
## [22] "city.mpg" "highway.mpg" "price"
## Show the first rows of the cleaned data frame.
head(auto_prices)
## make fuel.type aspiration num.of.doors body.style drive.wheels
## 1 alfa-romero gas std two convertible rwd
## 2 alfa-romero gas std two convertible rwd
## 3 alfa-romero gas std two hatchback rwd
## 4 audi gas std four sedan fwd
## 5 audi gas std four sedan 4wd
## 6 audi gas std two sedan fwd
## engine.location wheel.base length width height curb.weight engine.type
## 1 front 88.6 168.8 64.1 48.8 2548 dohc
## 2 front 88.6 168.8 64.1 48.8 2548 dohc
## 3 front 94.5 171.2 65.5 52.4 2823 ohcv
## 4 front 99.8 176.6 66.2 54.3 2337 ohc
## 5 front 99.4 176.6 66.4 54.3 2824 ohc
## 6 front 99.8 177.3 66.3 53.1 2507 ohc
## num.of.cylinders engine.size fuel.system bore stroke compression.ratio
## 1 four 130 mpfi 3.47 2.68 9.0
## 2 four 130 mpfi 3.47 2.68 9.0
## 3 six 152 mpfi 2.68 3.47 9.0
## 4 four 109 mpfi 3.19 3.40 10.0
## 5 five 136 mpfi 3.19 3.40 8.0
## 6 five 136 mpfi 3.19 3.40 8.5
## horsepower peak.rpm city.mpg highway.mpg price
## 1 111 5000 21 27 13495
## 2 111 5000 21 27 16500
## 3 154 5000 19 26 16500
## 4 102 5500 24 30 13950
## 5 115 5500 18 22 17450
## 6 110 5500 19 25 15250
## Show the type of each column and a preview of its values.
str(auto_prices)
## 'data.frame': 195 obs. of 24 variables:
## $ make : chr "alfa-romero" "alfa-romero" "alfa-romero" "audi" ...
## $ fuel.type : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ num.of.doors : chr "two" "two" "two" "four" ...
## $ body.style : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drive.wheels : chr "rwd" "rwd" "rwd" "fwd" ...
## $ engine.location : chr "front" "front" "front" "front" ...
## $ wheel.base : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 64.8 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 54.3 ...
## $ curb.weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 2395 ...
## $ engine.type : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ num.of.cylinders : chr "four" "four" "six" "four" ...
## $ engine.size : int 130 130 152 109 136 136 136 136 131 108 ...
## $ fuel.system : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ bore : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.5 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 2.8 ...
## $ compression.ratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 8.8 ...
## $ horsepower : num 111 111 154 102 115 110 110 110 140 101 ...
## $ peak.rpm : num 5000 5000 5000 5500 5500 5500 5500 5500 5500 5800 ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 23 ...
## $ highway.mpg : int 27 27 26 30 22 25 25 25 20 29 ...
## $ price : num 13495 16500 16500 13950 17450 ...
## Per-column summary of the cleaned data frame.
summary(auto_prices)
## make fuel.type aspiration num.of.doors
## Length:195 Length:195 Length:195 Length:195
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## body.style drive.wheels engine.location wheel.base
## Length:195 Length:195 Length:195 Min. : 86.6
## Class :character Class :character Class :character 1st Qu.: 94.5
## Mode :character Mode :character Mode :character Median : 97.0
## Mean : 98.9
## 3rd Qu.:102.4
## Max. :120.9
## length width height curb.weight
## Min. :141.1 Min. :60.30 Min. :47.80 Min. :1488
## 1st Qu.:166.3 1st Qu.:64.05 1st Qu.:52.00 1st Qu.:2145
## Median :173.2 Median :65.40 Median :54.10 Median :2414
## Mean :174.3 Mean :65.89 Mean :53.86 Mean :2559
## 3rd Qu.:184.1 3rd Qu.:66.90 3rd Qu.:55.65 3rd Qu.:2944
## Max. :208.1 Max. :72.00 Max. :59.80 Max. :4066
## engine.type num.of.cylinders engine.size fuel.system
## Length:195 Length:195 Min. : 61.0 Length:195
## Class :character Class :character 1st Qu.: 98.0 Class :character
## Mode :character Mode :character Median :120.0 Mode :character
## Mean :127.9
## 3rd Qu.:145.5
## Max. :326.0
## bore stroke compression.ratio horsepower
## Min. :2.540 Min. :2.07 Min. : 7.00 Min. : 48.0
## 1st Qu.:3.150 1st Qu.:3.11 1st Qu.: 8.50 1st Qu.: 70.0
## Median :3.310 Median :3.29 Median : 9.00 Median : 95.0
## Mean :3.329 Mean :3.25 Mean :10.19 Mean :103.3
## 3rd Qu.:3.590 3rd Qu.:3.41 3rd Qu.: 9.40 3rd Qu.:116.0
## Max. :3.940 Max. :4.17 Max. :23.00 Max. :262.0
## peak.rpm city.mpg highway.mpg price
## Min. :4150 Min. :13.00 Min. :16.00 Min. : 5118
## 1st Qu.:4800 1st Qu.:19.50 1st Qu.:25.00 1st Qu.: 7756
## Median :5100 Median :25.00 Median :30.00 Median :10245
## Mean :5099 Mean :25.37 Mean :30.84 Mean :13248
## 3rd Qu.:5500 3rd Qu.:30.00 3rd Qu.:35.00 3rd Qu.:16509
## Max. :6600 Max. :49.00 Max. :54.00 Max. :45400
## Draw one bar chart per character column of `df`.
##
## Prints the header "character columns:", then for every character column
## prints its name followed by a bar chart of its values, with the x-axis
## labels rotated by 90 degrees. Non-character columns are skipped.
## Also sets the repr.plot.width / repr.plot.height options.
plot_bars <- function(df) {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3.5)
  print("character columns:")

  all_names <- colnames(df)
  is_chr <- vapply(
    all_names,
    function(nm) is.character(df[, nm]),
    logical(1)
  )

  for (chr_col in all_names[is_chr]) {
    print(chr_col)
    rotated_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))
    chart <- ggplot(df, aes_string(chr_col)) +
      geom_bar(alpha = 0.6) +
      rotated_labels
    print(chart)
  }
}
plot_bars(auto_prices)
## [1] "character columns:"
## [1] "make"
## [1] "fuel.type"
## [1] "aspiration"
## [1] "num.of.doors"
## [1] "body.style"
## [1] "drive.wheels"
## [1] "engine.location"
## [1] "engine.type"
## [1] "num.of.cylinders"
## [1] "fuel.system"
## Draw a density plot with a rug for each column named in `plotcols`.
##
##   df        data frame holding the columns
##   plotcols  character vector of column names to plot
##
## No numeric check is applied: the is.numeric() guard is deliberately left
## disabled, so every listed column is plotted.
## Also sets the repr.plot.width / repr.plot.height options.
plot_dist <- function(df, plotcols) {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3)
  for (x_col in plotcols) {
    # guard left disabled: if (is.numeric(df[, x_col])) { ... }
    density_curve <- geom_density(color = 'blue')
    chart <- ggplot(df, aes_string(x_col)) + density_curve + geom_rug()
    print(chart)
  }
}
plot_dist(auto_prices, plotcols)
## Scatter plot of `col_y` against each column in `cols`, with transparency.
##
##   df     data frame holding the columns
##   cols   names of the columns used on the x axis, one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   alpha  point transparency passed to geom_point()
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_scatter_t <- function(df, cols, col_y = 'price', alpha = 1.0) {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3.5)
  for (x_col in cols) {
    heading <- paste('Scatter plot of', col_y, 'vs.', x_col)
    chart <- ggplot(df, aes_string(x_col, col_y)) +
      geom_point(alpha = alpha) +
      ggtitle(heading)
    print(chart)
  }
}
plot_scatter_t(auto_prices, plotcols, alpha = 0.2)
## 2-D hexbin plot of `col_y` against each column in `cols`.
##
##   df     data frame holding the columns
##   cols   names of the columns used on the x axis, one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   bins   number of bins passed to geom_hex()
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_hex <- function(df, cols, col_y = 'price', bins = 30) {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3.5)
  for (x_col in cols) {
    heading <- paste('2-D hexbin plot of', col_y, 'vs.', x_col)
    hex_layer <- geom_hex(show.legend = TRUE, bins = bins)
    chart <- ggplot(df, aes_string(x_col, col_y)) + hex_layer + ggtitle(heading)
    print(chart)
  }
}
plot_hex(auto_prices, plotcols, bins = 10)
## Box plot of `col_y` grouped by each column in `cols`.
##
##   df     data frame holding the columns
##   cols   names of the grouping columns (x axis), one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_box <- function(df, cols, col_y = 'price') {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3.5)
  for (group_col in cols) {
    heading <- paste('Box plot of', group_col, 'vs.', col_y)
    chart <- ggplot(df, aes_string(group_col, col_y)) +
      geom_boxplot() +
      ggtitle(heading)
    print(chart)
  }
}
plot_box(auto_prices, cat_cols)
## Violin plot of `col_y` grouped by each column in `cols`.
##
##   df     data frame holding the columns
##   cols   names of the grouping columns (x axis), one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   bins   accepted but not used by this function
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_violin <- function(df, cols, col_y = 'price', bins = 30) {
  # Set the initial plot area dimensions
  options(repr.plot.width=4, repr.plot.height=3.5)
  for (group_col in cols) {
    heading <- paste('Violin plot of', group_col, 'vs.', col_y)
    chart <- ggplot(df, aes_string(group_col, col_y)) +
      geom_violin() +
      ggtitle(heading)
    print(chart)
  }
}
plot_violin(auto_prices, cat_cols)
## Scatter plot of `col_y` against each column in `cols`, with the point
## shape taken from the `fuel.type` column of `df`.
##
##   df     data frame holding the columns, including `fuel.type`
##   cols   names of the columns used on the x axis, one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   alpha  point transparency passed to geom_point()
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_scatter_sp <- function(df, cols, col_y = 'price', alpha = 1.0) {
  # Set the initial plot area dimensions
  options(repr.plot.width=5, repr.plot.height=3.5)
  for (x_col in cols) {
    heading <- paste('Scatter plot of', col_y, 'vs.', x_col, '\n with shape by fuel type')
    points <- geom_point(aes(shape = factor(fuel.type)), alpha = alpha)
    chart <- ggplot(df, aes_string(x_col, col_y)) + points + ggtitle(heading)
    print(chart)
  }
}
plot_scatter_sp(auto_prices, plotcols, alpha = 0.2)
## Scatter plot of `col_y` against each column in `cols`, with the point
## shape taken from `fuel.type` and the point size from the squared
## `curb.weight`.
##
##   df     data frame holding the columns, including `fuel.type` and
##          `curb.weight`
##   cols   names of the columns used on the x axis, one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   alpha  point transparency passed to geom_point()
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_scatter_sp_sz <- function(df, cols, col_y = 'price', alpha = 1.0) {
  # Set the initial plot area dimensions
  options(repr.plot.width=5, repr.plot.height=3.5)
  # Squared weight is added to the local copy of df only
  df$curb.weight.2 <- df$curb.weight^2
  for (x_col in cols) {
    heading <- paste('Scatter plot of', col_y, 'vs.', x_col, '\n with shape by fuel type')
    points <- geom_point(
      aes(shape = factor(fuel.type), size = curb.weight.2),
      alpha = alpha
    )
    chart <- ggplot(df, aes_string(x_col, col_y)) + points + ggtitle(heading)
    print(chart)
  }
}
plot_scatter_sp_sz(auto_prices, plotcols, alpha = 0.1)
## Scatter plot of `col_y` against each column in `cols`, with the point
## shape taken from `fuel.type`, the size from the squared `curb.weight`
## and the color from `aspiration`.
##
##   df     data frame holding the columns, including `fuel.type`,
##          `curb.weight` and `aspiration`
##   cols   names of the columns used on the x axis, one plot per name
##   col_y  name of the column used on the y axis (default 'price')
##   alpha  point transparency passed to geom_point()
##
## Also sets the repr.plot.width / repr.plot.height options.
plot_scatter_sp_sz_cl <- function(df, cols, col_y = 'price', alpha = 1.0) {
  # Set the initial plot area dimensions
  options(repr.plot.width=5, repr.plot.height=3.5)
  # Squared weight is added to the local copy of df only
  df$curb.weight.2 <- df$curb.weight^2
  for (x_col in cols) {
    heading <- paste(
      'Scatter plot of', col_y, 'vs.', x_col,
      '\n with shape by fuel type',
      '\n and color by aspiration'
    )
    points <- geom_point(
      aes(shape = factor(fuel.type), size = curb.weight.2, color = aspiration),
      alpha = alpha
    )
    chart <- ggplot(df, aes_string(x_col, col_y)) + points + ggtitle(heading)
    print(chart)
  }
}
plot_scatter_sp_sz_cl(auto_prices, plotcols, alpha = 0.2)
# Set the initial plot area dimensions
options(repr.plot.width=6, repr.plot.height=6)
## Pairs plot of the `plotcols` columns, colored by fuel type.
## Lower panels: points for continuous pairs, faceted histograms for mixed
## pairs. Upper panels: density for continuous pairs, box plots for mixed.
plot_ggp <- ggpairs(
  auto_prices,
  mapping = aes(color = fuel.type, alpha = 0.1),
  columns = plotcols,
  lower = list(
    continuous = wrap("points", alpha = 0.3),
    combo = wrap("facethist", binwidth=0.8)
  ),
  upper = list(
    continuous = ggally_density,
    combo = wrap("box")
  ),
  progress = FALSE
)