# This cheat sheet serves as a reference for the topics discussed in this course.
# Working directory and files
getwd() # current working directory
list.files("./") # list files in the current wd
list.files("../") # list files from one level above wd
list.files("~/") # list files in the home directory
# Basic math functions and inspecting the workspace
x <- rnorm(n = 100) # simulate 100 random numbers from a normal distribution
mean(x) # sample mean
sd(x) # sample standard deviation
log(2) # natural logarithm
exp(3) # exponential function
sqrt(4) # square root
pi^2 # pi is a built-in constant
ls() # objects in the current user environment
length(ls()) # number of objects
is.character(ls()) # ls() returns the object names as a character vector
rm(x) # remove objects
# Data types and conversion between them
TRUE # logical value
"hi" # string / character
234 # numeric
is.logical(TRUE)
is.logical("TRUE") # FALSE: this is a character string!
pi == "pi" # FALSE: pi is a numeric variable while "pi" is a character string
as.numeric(TRUE) # TRUE is 1
as.numeric(FALSE) # FALSE is 0
as.logical(0) # the reverse also works: 0 becomes FALSE
as.numeric("hi") # not a number: the result is NA, with a warning
as.numeric("5") # a string that holds a number is converted to 5
as.character(TRUE) # "TRUE"
# Summaries of a numeric vector
x <- c(2, 3, 6, 1, 3)
sum(x) # sum of all elements
prod(x) # product of all elements
min(x) # smallest element
max(x) # largest element
sort(x) # elements in increasing order
length(x) # number of elements
ls() # objects in the current user environment
length(ls()) # number of objects
is.character(ls()) # ls() returns the object names as a character vector
rm(x) # remove objects
# Repetitions and sequences
rep(c(1, 2, 3), times = 3) # repeat the whole vector: 1 2 3 1 2 3 1 2 3
rep(c(1, 2, 3), each = 3) # repeat each element: 1 1 1 2 2 2 3 3 3
rep(seq(-5, 5, 2.5), 2) # the sequence -5, -2.5, 0, 2.5, 5 repeated twice
seq(1, 10, 1) # from 1 to 10 in steps of 1
1:10 # shorthand giving the same values
# Comparisons and logical operators
x <- 5
x < 5 # FALSE
x > 5 # FALSE
x == 5 # TRUE
x == 5 | x < 2 # OR: TRUE, because the first condition holds
x == 5 & x < 2 # AND: FALSE, because the second condition fails
TRUE | FALSE # TRUE
TRUE & FALSE # FALSE
y <- c(TRUE, TRUE, FALSE)
all(y) # are all values TRUE? FALSE
any(y) # is at least one value TRUE? TRUE
length(y) == 3 # TRUE
sum(y) # TRUE counts as 1, so this is the number of TRUE values: 2
# Summarising logical vectors and detecting missing values
x <- rnorm(200)
mean(x < 0) # share of negative values
sum(x > 2*sd(x)) # number of values larger than two sample standard deviations
y <- c(NA, 2, 4)
is.na(y) # TRUE FALSE FALSE
sum(is.na(y)) # number of missing values: 1
# Reading and exploring a data frame
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE) # keep text columns as character
nrow(titanic) # number of rows
names(titanic) # column names
head(titanic) # first rows
summary(titanic) # summary of every column
mean(titanic$survived) # share of survivors -- presumably survived is coded 0/1; verify against the data
sum(is.na(titanic$age)) # number of missing ages
range(titanic$age, na.rm = TRUE) # smallest and largest age, ignoring missing values
table(titanic$pclass, titanic$sex) # cross table: class (rows) by sex (columns)
tab <- table(titanic$pclass, titanic$survived)
prop.table(tab, margin = 2) # proportions within each column (survived)
prop.table(tab, margin = 1) # proportions within each row (pclass)
# derive new columns
titanic$isMale <- titanic$sex == "male"
titanic$survived <- as.logical(titanic$survived)
titanic$isChild <- titanic$age < 18 # NA where age is missing
# Count the missing (NA) entries of x.
#
# x: a vector (or any object is.na() accepts).
# Returns the number of NA entries as a single integer.
countMissings <- function(x) {
  sum(is.na(x))
}
# warnings and errors
x <- c("a", "b", "c", NA) # a character vector with one missing value
# stop() raises an error and aborts the current evaluation;
# x is character here, so this check triggers
if (!is.numeric(x)){
stop("x is not numeric!")
}
# warning() reports a problem but lets execution continue
if (any(is.na(x))){
warning("x contains missing values")
}
# Number of missing values per column. vapply() walks over the columns of the
# data frame directly (apply() would first coerce the whole data frame to a
# matrix) and checks that every result is a single integer.
vapply(titanic, countMissings, integer(1))
# Indexing vectors
x <- rnorm(10)
x[x > 0] # keep the positive values (logical indexing)
x[x < -1 | x > 2] # keep values below -1 or above 2
x[1] # first value
x[-1] # everything except the first value
x[-length(x)] # everything except the last value
x[length(x)] # the last value
which(x > 0) # get the indices of positive entries
# these two expressions give the same result. why?
# which() returns the positions where the condition is TRUE, and indexing by
# those positions selects the same elements as indexing by the logical vector
x[which(x > 0)]
x[x > 0]
# Drop the missing (NA) entries of x.
#
# x: a vector.
# Returns x restricted to its non-missing elements, in the original order.
removeNA <- function(x) {
  isObserved <- !is.na(x)
  x[isObserved]
}
# Subsetting and ordering rows of a data frame
titanic[titanic$pclass == 1 & titanic$sex == "female", ] # women in first class
# order by age, oldest first; order() puts rows with a missing age at the end
titanic <- titanic[order(titanic$age, decreasing = TRUE), ]
titanic[1, ] # oldest passenger
# the last row may have a missing age, so look only at rows with a known age
knownAge <- titanic[!is.na(titanic$age), ]
knownAge[nrow(knownAge), ] # youngest passenger
# top 10 youngest in first class
firstClass <- titanic[titanic$pclass == 1, ]
# na.last = NA drops missing ages; head() never returns more indices than exist
youngest <- head(order(firstClass$age, na.last = NA), 10)
firstClass[youngest, ]
# Linear probability model for survival
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE)
titanic$pclass <- as.factor(titanic$pclass) # treat class as categorical
# data = titanic tells lm() where to find the variables named in the formula
model <- lm(survived ~ pclass + age + sex, data = titanic)
# predicting on the full data frame gives one value per row (NA where a
# predictor is missing), so pred lines up with titanic$survived
pred <- predict(model, newdata = titanic)
table(titanic$survived, pred > 0.5) # observed vs. predicted survival