R
lapply
- functiondplyr
packagelm
functionR
with the lubridate
packagestringr
package
titanic <- read.csv(file = "./www/titanic.csv", stringsAsFactors = FALSE)
# text format
# row.names = FALSE keeps the row index out of the file. With the default
# (row.names = TRUE) every write/read round trip adds one more unnamed index
# column, which read.csv() then names "X", "X.1", ...
write.csv(titanic, file = "./www/titanic.csv", row.names = FALSE)
# NOTE(review): this overwrites the same path with a semicolon-separated file;
# such a file has to be read back with read.csv(..., sep = ";").
write.table(titanic, file = "./www/titanic.csv",
sep = ";", row.names = FALSE)
# binary 'R' format
# saveRDS() stores a single object without its name; restore it with readRDS().
saveRDS(titanic, file = "./www/titanic.rds")
# save() stores the object together with its name; restore it with load().
save(titanic, file = "./www/titanic.RData")
Homogen | Heterogen | |
---|---|---|
Eindimensional | vector c() | list list() |
Zweidimensional | matrix matrix() | data.frame data.frame() |
l <- list(values = 1:10, fun = sum, innerlist = list(1, 2))
l
$values
[1] 1 2 3 4 5 6 7 8 9 10
$fun
function (..., na.rm = FALSE) .Primitive("sum")
$innerlist
$innerlist[[1]]
[1] 1
$innerlist[[2]]
[1] 2
length(l)
[1] 3
L[...]
vectors
or data.frames
names
)l[c(TRUE, FALSE, FALSE)]
$values
[1] 1 2 3 4 5 6 7 8 9 10
l[c("innerlist", "fun")]
$innerlist
$innerlist[[1]]
[1] 1
$innerlist[[2]]
[1] 2
$fun
function (..., na.rm = FALSE) .Primitive("sum")
L[[...]]
$
-sign extraction possible (if list has names
)l[[1]]
[1] 1 2 3 4 5 6 7 8 9 10
l$values
[1] 1 2 3 4 5 6 7 8 9 10
l[["fun"]]
function (..., na.rm = FALSE) .Primitive("sum")
l[["fun"]](c(1, 2, 3)) # directly apply the 'sum' function
[1] 6
vector
but two-dimensionaldata.frame
but only one data-type allowed (numeric
, logical
, character
)solve()
%*%
t()
matrix()
function with at least two arguments:
c()
of elementsnrow
or ncol
specificationbyrow = TRUE
mat <- matrix(c(1, 3, 2, 6, 4, 5), ncol = 3)
mat
[,1] [,2] [,3]
[1,] 1 2 4
[2,] 3 6 5
matrix(c(1, 3, 2, 6, 4, 5), ncol = 3, byrow = TRUE)
[,1] [,2] [,3]
[1,] 1 3 2
[2,] 6 4 5
c(ncol(mat), nrow(mat))
[1] 3 2
M[i, j]
vectors
or data.frames
rownames
or colnames
)mat[c(1, 2), c(1, 2)]
[,1] [,2]
[1,] 1 2
[2,] 3 6
apply
: iteration over the columns or rows of a matrix
lapply
, sapply
: iteration over a vector
, list
or the columns of a data.frame
for
loops
sapply(X, FUN, ...)
X
is a list, a vector or a data frame
FUN
is a function which is applied to each column of a data frame, each entry of a list or each value of a vector
...
additional arguments of the function specified in FUN
titanic <- read.csv("./www/titanic.csv", stringsAsFactors = FALSE)
# Count the missing (NA) entries of `x`.
# `x` is any object accepted by is.na(); the result is a single integer.
countNA <- function(x) {
  missing_flags <- is.na(x)
  sum(missing_flags)
}
sapply(titanic, countNA)
X.1 X pclass survived name sex age embarked
0 0 0 0 0 0 263 0
l <- list(rnorm(100), rnorm(100, mean = 3), rnorm(100, mean = -3))
lapply(l, mean)
[[1]]
[1] -0.04727389
[[2]]
[1] 2.999733
[[3]]
[1] -2.786998
for
loop
result <- rep(NA, 100)
# Fill each of the 100 slots of `result` with the mean of 1000 draws from
# the standard normal distribution.
for (i in seq_len(100)) {
  result[[i]] <- mean(rnorm(n = 1000))
}
sapply
loop
result <- sapply(1:100, function(i) {
mean(rnorm(n = 1000))
})
dplyr
package: Grouping and aggregation of data frames
dplyr
package provides a powerful grammar of data analysis
%>%
data.frame
operations:
tidyr
package)
library(dplyr)
# Load the passenger data; character columns stay character (no factors).
titanic <- read.csv("./www/titanic.csv", stringsAsFactors = FALSE)
# Aggregate per combination of passenger class and sex.
titanic %>%
# One group per (pclass, sex) pair.
group_by(pclass, sex) %>%
# survived: mean of the `survived` column within the group.
# meanAge: mean age within the group, ignoring missing ages (na.rm = TRUE).
# The printed result is still grouped by pclass.
summarize(survived = mean(survived), meanAge = mean(age, na.rm = TRUE))
# A tibble: 6 x 4
# Groups: pclass [?]
pclass sex survived meanAge
<int> <chr> <dbl> <dbl>
1 1 female 0.965 37.0
2 1 male 0.341 41.0
3 2 female 0.887 27.5
4 2 male 0.146 30.8
5 3 female 0.491 22.2
6 3 male 0.152 26.0
titanic <- read.csv("./www/titanic.csv", stringsAsFactors = FALSE)
model <- lm(survived ~ sex, data = titanic)
summary(model)
Call:
lm(formula = survived ~ sex, data = titanic)
Residuals:
Min 1Q Median 3Q Max
-0.7275 -0.1910 -0.1910 0.2725 0.8090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.72747 0.01912 38.05 <2e-16 ***
sexmale -0.53648 0.02382 -22.52 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4127 on 1307 degrees of freedom
Multiple R-squared: 0.2795, Adjusted R-squared: 0.279
F-statistic: 507.1 on 1 and 1307 DF, p-value: < 2.2e-16
titanic$pclass <- as.factor(titanic$pclass)
model <- lm(survived ~ pclass, data = titanic)
summary(model)
Call:
lm(formula = survived ~ pclass, data = titanic)
Residuals:
Min 1Q Median 3Q Max
-0.6192 -0.2553 -0.2553 0.3808 0.7447
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.61920 0.02571 24.084 < 2e-16 ***
pclass2 -0.18959 0.03784 -5.011 6.17e-07 ***
pclass3 -0.36391 0.03102 -11.732 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4621 on 1306 degrees of freedom
Multiple R-squared: 0.09768, Adjusted R-squared: 0.0963
F-statistic: 70.69 on 2 and 1306 DF, p-value: < 2.2e-16
titanic$embarked <- as.factor(titanic$embarked)
model <- lm(survived ~ pclass + embarked, data = titanic)
summary(model)
Call:
lm(formula = survived ~ pclass + embarked, data = titanic)
Residuals:
Min 1Q Median 3Q Max
-0.6963 -0.3590 -0.2152 0.4475 0.7848
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.00000 0.32405 3.086 0.002072 **
pclass2 -0.14055 0.03923 -3.583 0.000352 ***
pclass3 -0.33732 0.03283 -10.276 < 2e-16 ***
embarkedC -0.30369 0.32559 -0.933 0.351138
embarkedQ -0.32438 0.32819 -0.988 0.323141
embarkedS -0.44750 0.32539 -1.375 0.169283
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4583 on 1303 degrees of freedom
Multiple R-squared: 0.1145, Adjusted R-squared: 0.1111
F-statistic: 33.68 on 5 and 1303 DF, p-value: < 2.2e-16
# Fitted values of `model` for the data it was fitted on. Call the predict()
# generic rather than the predict.lm() method directly, so S3 dispatch picks
# the right method for whatever class `model` has.
pred <- predict(model)
# Classify a passenger as "survived" when the fitted value exceeds 0.5.
survived_pred <- pred > 0.5
# Cross-tabulate observed survival (rows) against the classification (columns).
table(titanic$survived, survived_pred)
survived_pred
FALSE TRUE
0 669 140
1 282 218
# Linear model of survival on passenger class and sex.
model <- lm(survived ~ pclass + sex, data = titanic)
# Two new observations: a male passenger in class "1" and one in class "2".
new <- data.frame(pclass = c("1", "2"), sex = c("male", "male"))
# Point predictions with 90% prediction intervals (columns fit, lwr, upr).
# Call the predict() generic instead of predict.lm() so S3 dispatch selects
# the method from the class of `model`.
predict(model, newdata = new, interval = "prediction", level = 0.90)
fit lwr upr
1 0.3941135 -0.2571518 1.045379
2 0.2364034 -0.4149712 0.887778