This cheat sheet serves as a reference for the topics discussed in this course.


Basics

# Working directory and file listing
getwd() # current working directory
list.files("./") # list files in the current wd
list.files("../") # list files from one level above wd
list.files("~/") # list files in the home directory
# Simulated data and basic numeric functions
x <- rnorm(n = 100) # simulate random numbers from normal distribution
mean(x) # sample mean of x
sd(x) # sample standard deviation of x
log(2) # natural logarithm
exp(3) # exponential function, e^3
sqrt(4) # square root
pi^2 # pi is a built-in constant, ^ is exponentiation

# Inspecting and cleaning up the workspace
ls() # objects in the current user environment
length(ls()) # number of objects
is.character(ls()) # TRUE: ls() returns the object names as a character vector
rm(x) # remove objects

Vectors and Operators

# The three basic value types used in this cheat sheet
TRUE # logical value
"hi" # string/ character
234 # numeric

is.logical(TRUE) # TRUE
is.logical("TRUE") # FALSE: this is a character string!

pi == "pi" # FALSE: pi is a numeric variable while "pi" is a character string

# Converting between types
as.numeric(TRUE) # TRUE is 1
as.numeric(FALSE) # FALSE is 0
as.logical(0) # the reverse also works: 0 becomes FALSE
as.numeric("hi") # NA (with a warning): "hi" cannot be read as a number
as.numeric("5") # 5: a string that looks like a number is converted
as.character(TRUE) # "TRUE"

# Summary functions on a numeric vector
x <- c(2, 3, 6, 1, 3)
sum(x) # 15
prod(x) # 108
min(x) # 1
max(x) # 6
sort(x) # 1 2 3 3 6 (ascending by default)
length(x) # 5
ls() # objects in the current user environment
length(ls()) # number of objects
is.character(ls()) # TRUE: ls() returns the object names as a character vector
rm(x) # remove objects

# Repeating and generating sequences
rep(c(1, 2, 3), times = 3) # repeat the whole vector: 1 2 3 1 2 3 1 2 3
rep(c(1, 2, 3), each = 3) # repeat each element: 1 1 1 2 2 2 3 3 3
rep(seq(-5, 5, 2.5), 2) # -5 -2.5 0 2.5 5, repeated twice
seq(1, 10, 1) # from 1 to 10 in steps of 1
1:10 # shorthand for the integer sequence 1 to 10

# Comparison operators return logical values
x <- 5
x < 5 # FALSE
x > 5 # FALSE
x == 5 # TRUE
x == 5 | x < 2 # TRUE: | is the logical OR
x == 5 & x < 2 # FALSE: & is the logical AND
TRUE | FALSE # TRUE
TRUE & FALSE # FALSE
# Summarising a logical vector
y <- c(TRUE, TRUE, FALSE)
all(y) # FALSE: not every element is TRUE
any(y) # TRUE: at least one element is TRUE
length(y) == 3 # TRUE
sum(y) # 2: each TRUE counts as 1

# Logical vectors can be averaged and summed directly
x <- rnorm(200)
mean(x < 0) # proportion of negative values
sum(x > 2*sd(x)) # number of values larger than twice the sample standard deviation

# Detecting missing values
y <- c(NA, 2, 4)
is.na(y) # TRUE FALSE FALSE
sum(is.na(y)) # 1: number of missing values

Data import and data frames

# Read the passenger data; keep text columns as character vectors, not factors
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE)
nrow(titanic) # number of rows
names(titanic) # column names
head(titanic) # first rows of the data frame
summary(titanic) # per-column summary
mean(titanic$survived) # share of survivors, assuming survived is coded 0/1 (confirm against the data)
sum(is.na(titanic$age)) # number of missing ages
range(titanic$age, na.rm = TRUE) # smallest and largest age, ignoring missing values
table(titanic$pclass, titanic$sex) # cross-tabulation of class by sex

# Relative frequencies of class by survival
tab <- table(titanic$pclass, titanic$survived)
prop.table(tab, margin = 2) # column proportions: each survived column sums to 1
prop.table(tab, margin = 1) # row proportions: each pclass row sums to 1

# Derive new columns from existing ones
titanic$isMale <- titanic$sex == "male" # logical flag for male passengers
titanic$survived <- as.logical(titanic$survived) # convert survived to logical
titanic$isChild <- titanic$age < 18 # NA where age is missing

Functions and flow control

# Count the missing (NA) entries of x.
#   x: any object accepted by is.na()
#   returns: the number of NA entries as a single integer
countMissings <- function(x) {
  isMissing <- is.na(x)
  sum(isMissing)
}

# warnings and errors
x <- c("a", "b", "c", NA)
# stop() signals an error; x is a character vector here, so this branch runs
if (!is.numeric(x)){
    stop("x is not numeric!")
}

# warning() signals a warning; x contains an NA, so the condition is TRUE
if (any(is.na(x))){
  warning("x contains missing values")
}

# Count the missing values in every column of titanic. Iterating over the
# columns directly avoids apply() first coercing the whole data frame to a
# matrix; integer(1) states that countMissings returns one integer per column.
vapply(titanic, countMissings, integer(1))

Extraction and updating

# Extracting elements with a logical condition
x <- rnorm(10)
x[x > 0] # keep the positive values
x[x < -1 | x > 2] # keep values below -1 or above 2

# Extracting elements by position
x[1] # first value
x[-1] # everything except the first value
x[-length(x)] # everything except the last value
x[length(x)] # the last value

which(x > 0) # get the indices of positive entries

# these two expressions give the same result. why?
x[which(x > 0)]
x[x > 0]

# Drop the missing (NA) entries of x.
#   x: a vector
#   returns: x restricted to its non-missing entries, in the original order
removeNA <- function(x){
  observed <- !is.na(x)
  x[observed]
}

# all female passengers in first class
titanic[titanic$pclass == 1 & titanic$sex == "female", ]

# order by age, oldest first; order() places rows with missing age at the end
titanic <- titanic[order(titanic$age, decreasing = TRUE), ]
titanic[1, ] # oldest passenger
# youngest passenger: the last row with a known age. Rows after it have a
# missing age, so nrow(titanic) would only be correct if no age were missing.
titanic[sum(!is.na(titanic$age)), ]

# top 10 youngest in first class
firstClass <- titanic[titanic$pclass == 1, ]
# head() returns at most 10 indices, so no NA rows appear if there are fewer
youngest <- head(order(firstClass$age), 10)
firstClass[youngest, ]

Linear regression

# Linear regression of survival on passenger class, age and sex
titanic <- read.csv("./data/titanic.csv", stringsAsFactors = FALSE)
titanic$pclass <- as.factor(titanic$pclass) # treat class as categorical
# data = titanic tells lm() where to find the variables named in the formula
model <- lm(survived ~ pclass + age + sex, data = titanic)
# newdata = titanic gives one prediction per row (NA where a predictor is
# missing), so pred has the same length as titanic$survived in the table below
pred <- predict(model, newdata = titanic)
table(titanic$survived, pred > 0.5) # observed survival vs. prediction above 0.5