This cheat sheet serves as a reference for the topics discussed in this course.

# Basics

# Where am I, and what is on disk?
getwd()            # path of the current working directory
list.files("./")   # contents of the working directory
list.files("../")  # contents of the directory one level up
list.files("~/")   # contents of the home directory

# Draw a sample and summarise it
x <- rnorm(100)    # 100 random draws from the normal distribution
mean(x)            # sample mean
sd(x)              # sample standard deviation

# A few built-in math helpers
log(2)             # natural logarithm
exp(3)             # exponential function
sqrt(4)            # square root
pi^2               # pi is a built-in constant

# Inspecting and cleaning up the workspace
ls()                # names of the objects in the current user environment
length(ls())        # how many objects there are
is.character(ls())  # is the result of ls() a character vector?
rm(x)               # remove x again

# Vectors and Operators

# Three basic value types
TRUE   # logical
"hi"   # character (string)
234    # numeric

# Quoting a value turns it into a character string
is.logical(TRUE)
is.logical("TRUE")  # "TRUE" in quotes is a character string, not a logical

# pi is a numeric constant while "pi" is a character string
pi == "pi"

# Converting between types
as.numeric(TRUE)    # TRUE becomes 1
as.numeric(FALSE)   # FALSE becomes 0
as.logical(0)       # the conversion also works in reverse
as.numeric("hi")    # not a number: yields NA and a warning
as.numeric("5")     # a string holding a number converts cleanly
as.character(TRUE)  # logical to string

# Summaries of a numeric vector
x <- c(2, 3, 6, 1, 3)
sum(x)     # total of all elements
prod(x)    # product of all elements
min(x)     # smallest element
max(x)     # largest element
sort(x)    # elements in increasing order
length(x)  # number of elements

# Workspace housekeeping
ls()                # names of the objects in the current user environment
length(ls())        # how many objects there are
is.character(ls())  # is the result of ls() a character vector?
rm(x)               # remove x again

# Repetition and sequences
rep(c(1, 2, 3), times = 3)  # repeat the whole vector three times
rep(c(1, 2, 3), each = 3)   # repeat each element three times
rep(seq(-5, 5, 2.5), 2)     # a sequence in steps of 2.5, repeated twice
seq(1, 10, 1)               # from 1 to 10 in steps of 1
1:10                        # shorthand for the integers 1 to 10

# Comparisons return logical values
x <- 5
x < 5
x > 5
x == 5
x == 5 | x < 2  # OR: TRUE if at least one side is TRUE
x == 5 & x < 2  # AND: TRUE only if both sides are TRUE
TRUE | FALSE
TRUE & FALSE

# Summarising a logical vector
y <- c(TRUE, TRUE, FALSE)
all(y)          # are all entries TRUE?
any(y)          # is at least one entry TRUE?
length(y) == 3
sum(y)          # TRUE counts as 1, so this is the number of TRUEs

# Logical vectors make counting and proportions easy
x <- rnorm(200)
mean(x < 0)           # proportion of negative values
sum(x > 2 * sd(x))    # number of values above twice the standard deviation

# Missing values
y <- c(NA, 2, 4)
is.na(y)       # which entries are missing?
sum(is.na(y))  # how many entries are missing?

# Data import and data frames

# Read the passenger data and take a first look at it.
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE)
nrow(titanic)
names(titanic)
summary(titanic)
mean(titanic$survived)
sum(is.na(titanic$age))  # number of missing ages
range(titanic$age, na.rm = TRUE)
table(titanic$pclass, titanic$sex)
tab <- table(titanic$pclass, titanic$survived)
prop.table(tab, margin = 2)  # proportions within each column
prop.table(tab, margin = 1)  # proportions within each row
titanic$isMale <- titanic$sex == "male"
titanic$survived <- as.logical(titanic$survived)
titanic$isChild <- titanic$age < 18

# Functions and flow control

# Count the missing values (NA) in x.
countMissings <- function(x) {
  nMissings <- sum(is.na(x))
  return(nMissings)
}

# warnings and errors
# NOTE: x is a character vector here, so the stop() below fires. Run this
# part line by line; source()-ing the file halts at the error.
x <- c("a", "b", "c", NA)
if (!is.numeric(x)) {
  stop("x is not numeric!")
}
if (any(is.na(x))) {
  warning("x contains missing values")
}

# Apply countMissings to every column (margin 2) of the data frame.
apply(titanic, 2, countMissings)

# Extraction and updating

x <- rnorm(10)
x[x > 0]
x[x < -1 | x > 2]
x[1]           # first value
x[-1]          # everything except the first value
x[-length(x)]  # everything except the last value
x[length(x)]   # the last value
which(x > 0)   # get the indices of positive entries

# these two expressions give the same result. why?
x[which(x > 0)]
x[x > 0]

# Return x without its missing values (NA).
removeNA <- function(x) {
  xClean <- x[!is.na(x)]
  return(xClean)
}

titanic[titanic$pclass == 1 & titanic$sex == "female", ]
titanic <- titanic[order(titanic$age, decreasing = TRUE), ]  # order by age
titanic[1, ]  # oldest passenger
# order() puts rows with a missing age last, so the last row is not
# necessarily the youngest passenger; which.min() ignores NA.
titanic[which.min(titanic$age), ]  # youngest passenger

# top 10 youngest in first class
firstClass <- titanic[titanic$pclass == 1, ]
youngest <- order(firstClass$age)[1:10]
firstClass[youngest, ]

# Linear regression

# Reload the raw data so that survived is back in its original coding.
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE)
titanic$pclass <- as.factor(titanic$pclass)  # treat class as categorical

# Regress survived on class, age and sex. The variables live in the data
# frame, so it has to be passed via `data`.
model <- lm(survived ~ pclass + age + sex, data = titanic)

# Predict for every row of titanic. Rows with a missing predictor get NA,
# which keeps pred the same length as titanic$survived.
pred <- predict(model, newdata = titanic)

# Cross-tabulate observed survival against predictions above 0.5
# (table() leaves out NA predictions by default).
table(titanic$survived, pred > 0.5)