Hinweis: Die Gliederung zu den Befehlen und Ergebnissen in R orientiert sich an dem Inhaltsverzeichnis der 17. Auflage der ‘Angewandten Statistik’. Nähere Hinweise zu den verwendeten Formeln sowie Erklärungen zu den Beispielen sind in dem Buch (Ebook) nachzulesen!

zur Druckversion

Hinweis: Die thematische Gliederung zu den aufgeführten Befehlen und Beispielen orientiert sich an dem Inhaltsverzeichnis der 17. Auflage der ‘Angewandten Statistik’. Nähere Hinweise zu den verwendeten Formeln sowie Erklärungen zu den Beispielen sind in dem Buch (Ebook) nachzulesen!

Aktuelle R-Umgebung zu den Beispielen

The R Project for Statistical Computing

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252   
## [3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C                   
## [5] LC_TIME=German_Germany.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.3  magrittr_1.5    tools_3.6.3     htmltools_0.4.0
##  [5] yaml_2.2.1      Rcpp_1.0.4      stringi_1.4.6   rmarkdown_2.1  
##  [9] knitr_1.28      stringr_1.4.0   xfun_0.12       digest_0.6.25  
## [13] rlang_0.4.5     evaluate_0.14

Download von Datensätzen, die in den folgenden Beispielen verwendet werden:

Cholesterin: cholesterin

Hemmhof: hemmhof

Hemmhof-2: hemmhof2

Propensity-Score: psdata

Hautkrebs: skincancer

Messwiederholungen: lmebroad

Cluster: cluster

Teratogenität: teratogen

Chemotherapie: chemotherapie

Sterbetafel: sterbetafel

8.1 Einführung

Bootstrapping:

x    <- rnorm(50, mean=10, sd=5)

stat <- function(x) quantile(x, probs=0.25); stat(x)
##    25% 
## 8.0496

B    <- 1000; t <- rep(NA, B)                                # number of bootstrap replications; preallocated result vector
for (i in 1:B) t[i] <- stat(sample(x, replace=TRUE))         # resample x with replacement, recompute the statistic

E.stat <- mean(t); E.stat                                    # expected value (bootstrap mean of the statistic)
## [1] 7.904204

V.stat <- var(t); V.stat                                     # Varianz
## [1] 0.929862

Jackknife:

x     <- rnorm(50, mean=10, sd=5)

theta <- function(x) quantile(x, probs=0.95); theta(x)
##    95% 
## 15.688

# Jackknife replications: recompute the statistic with each observation
# left out in turn (uses `<-` instead of `=` and seq_len() instead of 1:n).
n     <- length(x)
t     <- rep(NA, n)
for (i in seq_len(n)) t[i] <- theta(x[-i])

JE.theta <- mean(t); JE.theta                                # Schätzung
## [1] 15.70066

JV.theta <- ((n-1)/n) * sum((t-JE.theta)^2); JV.theta        # Varianz
## [1] 0.9509632

JC.theta <- n*theta(x) - (n-1)*JE.theta; JC.theta            # Korrektur
##     95% 
## 15.0676

Cross validation:

x <- seq(0, 10, by = 0.5)

n <- length(x)                                               # lineares Regressionsmodell
y <- 2 + 0.5*x; y <- y + rnorm(n, mean=0, sd=0.6)
resi <- lm(y~x)$residuals; sum(resi^2)                       # Residuen
## [1] 6.784686

# Leave-one-out cross validation: refit the regression without observation i,
# then predict y[i] from the held-out x[i].
err <- rep(NA, n)
for (i in 1:n) {
    mcf    <- lm(y[-i] ~ x[-i])$coef                         # coefficients of the reduced fit
    y.hat  <- mcf[1] + mcf[2]*x[i]                           # prediction for the held-out point
    err[i] <- y[i] - y.hat  }                                # prediction error

t.err <- sum(err^2); t.err                                   # total squared prediction error (CV criterion)
## [1] 8.174279

8.2 Lineare Regressionsmodelle

8.2.1 Die einfache lineare Regression

Beispiel Cholesterin und Alter:

# Read the semicolon-separated, comma-decimal CSV; use TRUE rather than the
# reassignable alias T (read.csv2 already defaults to sep=";" and dec=",").
cholesterin <- read.csv2("cholesterin.CSV", sep=";", dec=",", header=TRUE)
as.data.frame(cholesterin[1:10,])
attach(cholesterin)                          # NOTE(review): attach() is discouraged; the following sections rely on Alter/Chol being attached

n    <- length(Alter)                                         # Maßzahlen
m.x  <- mean(Alter);                   m.x
## [1] 39.41667
s.x  <- sd(Alter);                     s.x
## [1] 13.41614
m.y  <- mean(Chol);                    m.y
## [1] 3.354167
s.y  <- sd(Chol);                      s.y
## [1] 0.7779455
                                                              # Varianz- und Kovarianz
ss.xx <- sum((Alter - m.x)^2);             ss.xx 
## [1] 4139.833
ss.yy <- sum((Chol  - m.y)^2);             ss.yy
## [1] 13.91958
ss.xy <- sum((Alter - m.x)*(Chol - m.y));  ss.xy
## [1] 217.8583
                                                              # Regressionskoeffizienten
beta1 <- ss.xy / ss.xx;                    beta1
## [1] 0.0526249
beta0 <- m.y - beta1 * m.x;                beta0
## [1] 1.279868
par(lwd=2, font.axis=2, bty="n", ps=11, mfrow=c(1,1))  
plot(Alter, Chol, ylab="Cholesterin", las=1, cex=1.5)        # Punktwolke       
abline(beta0, beta1, col="black")                            # Regressionsgerade

Residuenanalyse:

Chol.e <- beta0 + beta1*Alter; e <- Chol - Chol.e
                                                          
par(lwd=2, font.axis=2, bty="n", ps=11, mfrow=c(1,2)) 
qqnorm(e, xlab = "Normal-Plot", ylab = "Residuen", las=1, cex=1.5,
                ylim=c(-0.6,+0.6), main="" )
lines(c(-2, +2),c(-0.6, +0.6), lty=2)
plot(Chol.e, e, ylab="Residuen", las=1, cex=1.5,
              xlab="Cholesterin geschätzt", ylim=c(-0.6,+0.6))
abline(h=0, lty=2)

Chol.hat <- beta0 + beta1 * Alter

par(lwd=2, font.axis=2, bty="n", ps=12, mfrow=c(1,1)) 
plot(Alter, Chol, ylab="Cholesterin", las=1, cex=1.5)        # Punktwolke       
abline(beta0, beta1, col="black")                            # Regressionsgerade
for (i in 1:n)                                               # Residuen
lines(c(Alter[i],Alter[i]), c(Chol[i],Chol.hat[i]), col="red", lty=2)

Varianzkomponenten (elementar):

Chol.hat <- beta0 + beta1 * Alter
                                                                 
ss.regression <- sum((Chol.hat - m.y)^2);    ss.regression   
## [1] 11.46477
ms.regression <- ss.regression
ss.residual   <- sum((Chol.hat - Chol)^2);   ss.residual
## [1] 2.454809
ms.residual   <- ss.residual/(n-2)
F.ratio       <- ms.regression/ms.residual;  F.ratio
## [1] 102.7473

ss.regression + ss.residual;                 ss.yy                              
## [1] 13.91958
## [1] 13.91958
                                                                   
r.cor <- sqrt(ss.regression/ss.yy);           r.cor          # Korrelationskoeffizient 
## [1] 0.9075481

cor(Alter, Chol)
## [1] 0.9075481

Statistische Eigenschaften der Schätzung (Matrizen):

Funktion ginv() in library(MASS)

library(MASS)
y <- Chol;  x <- Alter;   n <- length(x)                   
r <- Chol - Chol.hat

data <- cbind(x, y, r); as.data.frame(data[1:10,])

X <- cbind(rep(1,n), x)                   

SSx <- ginv(t(X)%*%X)                                        # SAQ x

beta <- SSx %*% t(X) %*% y; beta                             # Koeffizienten  
##           [,1]
## [1,] 1.2798684
## [2,] 0.0526249

SSr <- t(r)%*%r; SSr                                         # SAQ in den Residuen   
##          [,1]
## [1,] 2.454809

MSr <- SSr/(n-2); MSr                                        # MSQ in den Residuen     
##           [,1]
## [1,] 0.1115822

sqrt(SSr/(n-2))*sqrt(SSx[2,2])                               # Standardfehler von beta1
##             [,1]
## [1,] 0.005191658

sqrt(SSr/(n-2))*sqrt(SSx[1,1])                               # Standardfehler von beta0
##           [,1]
## [1,] 0.2156987

SSy <-  t(beta) %*% t(X) %*% y
MSy <- SSy - n*(mean(y)^2); MSy                              # MSQ in der Regression  
##          [,1]
## [1,] 11.46477

Lineares Modell in R mit der Funktion lm():

lin.model <- lm(Chol ~ Alter)
summary(lin.model)
## 
## Call:
## lm(formula = Chol ~ Alter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6111 -0.2151 -0.0058  0.2297  0.6256 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.279868   0.215699   5.934 5.69e-06 ***
## Alter       0.052625   0.005192  10.136 9.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.334 on 22 degrees of freedom
## Multiple R-squared:  0.8236, Adjusted R-squared:  0.8156 
## F-statistic: 102.7 on 1 and 22 DF,  p-value: 9.428e-10

anova(lin.model)

8.2.2 Multiple lineare Regression

Beispiel Körpergewicht, Hirngewicht und Wurfgröße bei Mäusen (data(litters) in library(DAAG)):

# Load the DAAG package and the litters data set (body weight, brain weight,
# litter size for 20 mice). The original repeated the library/data/attach
# sequence twice verbatim; the redundant second copy has been removed.
library(DAAG)
data(litters)
attach(litters)                          # NOTE(review): attach() is discouraged; later sections rely on it

litters[1:10,]                           # first ten observations

par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=12)
plot(bodywt, lsize, ylab="Größe des Wurfes", 
                xlab="Körpergewicht", cex=1.3)
plot(brainwt, lsize, ylab="Größe des Wurfes", 
                xlab="Gehirngewicht", cex=1.3)


par(mfrow=c(1,1), lwd=2, font.axis=2, bty="o", ps=12) 
pairs(litters, labels=c("Größe des Wurfes","Körpergewicht","Gehirngewicht"))

y <- lsize;  n <- length(y)         
X <- matrix(c(rep(1,20),bodywt,brainwt),nrow=20,
            dimnames = list(1:20, c("one", "bodywt", "brainwt"))); p <- 2 

t(X) %*% X                                           # Matrixprodukt                     
##             one     bodywt   brainwt
## one      20.000  154.96000  8.335000
## bodywt  154.960 1235.85592 64.948762
## brainwt   8.335   64.94876  3.480561
xtxi <- solve(t(X) %*% X); xtxi   
##                  one      bodywt     brainwt
## one       38.3034161  0.92161959 -108.924114
## bodywt     0.9216196  0.06404389   -3.402116
## brainwt -108.9241143 -3.40211562  324.615971
b.h <- xtxi %*% t(X) %*% y; b.h                      #  Parameterschätzung           
##              [,1]
## one     12.898778
## bodywt  -2.398031
## brainwt 31.628479
H   <- X %*% xtxi %*% t(X)                           # Hut-Matrix         
 
y.h <- H %*% y;                        
e.h <- y - y.h;                                      # Schätzfehler - Residuen          
cbind(y, y.h, e.h)[1:10,]
##    y                    
## 1  3 4.287621 -1.2876207
## 2  3 3.236048 -0.2360484
## 3  4 4.133877 -0.1338770
## 4  4 3.415120  0.5848797
## 5  5 5.118304 -0.1183044
## 6  5 3.580457  1.4195432
## 7  6 5.777820  0.2221804
## 8  6 6.297299 -0.2972987
## 9  7 8.089394 -1.0893942
## 10 7 6.479804  0.5201956
RSS <- t(e.h) %*% e.h; RSS                           # Summe über die quadr. Abweichungen
##          [,1]
## [1,] 11.48166

s.h <- sqrt(RSS/(n-p-1))                             # Schätzung der Standardabweichung 

se.b  <- sqrt(diag(xtxi))*s.h; se.b                  # Standardfehler der Schätzung     
##        one     bodywt    brainwt 
##  5.0862375  0.2079777 14.8068549

R <- 1 - RSS/sum((y-mean(y))^2);R                    # Berechnung des Bestimmtheitsmaßes
##           [,1]
## [1,] 0.9304142

Lineares Modell in R mit der Funktion lm():

fit <- lm(lsize ~ bodywt + brainwt, data=litters)
summary(fit)
## 
## Call:
## lm(formula = lsize ~ bodywt + brainwt, data = litters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2876 -0.3445 -0.1261  0.5230  1.5679 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.899      5.086   2.536   0.0213 *  
## bodywt        -2.398      0.208 -11.530 1.85e-09 ***
## brainwt       31.628     14.807   2.136   0.0475 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8218 on 17 degrees of freedom
## Multiple R-squared:  0.9304, Adjusted R-squared:  0.9222 
## F-statistic: 113.7 on 2 and 17 DF,  p-value: 1.45e-10

8.2.4 Analyse der Residuen im linearen Modell

Beispiel Körpergewicht, Hirngewicht und Wurfgröße bei Mäusen (data(litters) in library(DAAG)):

library(DAAG)                                            # Influence - Diagnostik 
data(litters)
                                                       
fit <- lm(lsize ~ bodywt + brainwt, data=litters)
res.diag <- influence.measures(fit)               
round(cooks.distance(fit), 3)
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 0.187 0.007 0.002 0.040 0.001 0.207 0.006 0.008 0.039 0.011 0.050 0.006 0.014 
##    14    15    16    17    18    19    20 
## 0.005 0.048 0.003 0.535 0.000 0.152 0.075
                                                        # Beobachtungen mit 'Einfluss'    
infl <- as.numeric(which(apply(res.diag$is.inf,1,any))); infl  
## [1] 17 19

# Leverage plot: litter size against hat values; observations flagged by
# influence.measures() are marked with crosses. Fixes the mojibake axis
# label "Gr??e des Wurfes" -> "Größe des Wurfes".
par(lwd=2, font.axis=2, bty="n", ps=11, mfrow=c(1,1))
plot(litters$lsize ~ hatvalues(fit), ylim=c(0,12),
        ylab="Größe des Wurfes", xlab = "Leverage")
points(hatvalues(fit)[infl], litters$lsize[infl], cex=2, pch=4)

8.2.5 Heteroskedastizität im linearen Modell

Beispiel Ausgaben für den Urlaub:

x <- c(1516, 1416, 1205, 1547,  829, 1669,  476, 1448, 1982,  978, 1567, 1401,
       1494, 2069, 1609, 1465, 1491, 1598, 1708, 1597, 1446, 1074, 1525, 2135, 1723)
y <- c(771,  868,  912,  728,  327,  959,  249,  700, 1650,  570, 1159,  517,  905,
       792, 1039, 830,  866,  993, 1241, 1164,  821,  776,  600, 1596,  578)

mod <- lm(y~x); summary(mod)                          # lineares Regressionsmodell
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465.58 -118.27   30.38  181.12  450.36 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -120.4336   196.7850  -0.612    0.547    
## x              0.6660     0.1294   5.148 3.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 230.4 on 23 degrees of freedom
## Multiple R-squared:  0.5354, Adjusted R-squared:  0.5152 
## F-statistic:  26.5 on 1 and 23 DF,  p-value: 3.234e-05

# Two panels: scatter plot with fitted line plus reference lines suggesting a
# widening funnel, and the residual plot showing the same funnel shape
# (variance growing with income) -- the visual signature of heteroskedasticity.
par(mfcol=c(1,2), lwd=2.5, font.axis=2, bty="l", ps=13)  
plot(x, y, xlab="X [Einkommen]", ylab="Y [Aufwendungen für den Urlaub]", 
     las=1, ylim=c(0,1800), yaxp=c(0,1800,9))
abline(mod, lty=2)                                           # least-squares fit
abline(a=-100, b=1.00, lty=3)                                # upper envelope (illustrative)
abline(a=-100, b=0.32, lty=3)                                # lower envelope (illustrative)
mod <- lm(y~x); res <- residuals(mod)
plot(x, res, xlab="X [Einkommen]", ylab="Residuen [Modellabweichungen]", 
     las=1, ylim=c(-500,500), yaxp=c(-500,+500,10))
abline(h=0, lty=2)                                           # zero reference line
abline(a=-50, b=0.30, lty=3)                                 # envelopes marking the spreading residuals
abline(a=50, b=-0.30, lty=3)

Funktion hccm() in library(car)

library(car)
v <- vcov(mod); sqrt(c(v[1,1], v[2,2]))               # Kovarianzmatrix 
## [1] 196.7849979   0.1293777
v <- hccm(mod, type="hc0"); sqrt(c(v[1,1], v[2,2]))   # korrigiert nach HC0
## [1] 181.8968175   0.1403881
v <- hccm(mod, type="hc1"); sqrt(c(v[1,1], v[2,2]))   # korrigiert nach HC1
## [1] 189.6405416   0.1463647
v <- hccm(mod, type="hc2"); sqrt(c(v[1,1], v[2,2]))   # korrigiert nach HC2
## [1] 196.5128558   0.1515079
v <- hccm(mod, type="hc3"); sqrt(c(v[1,1], v[2,2]))   # korrigiert nach HC3
## [1] 212.7195847   0.1637564

8.2.6 Hypothesentest und Konfidenzintervalle

Beispiel Körpergewicht, Hirngewicht und Wurfgröße bei Mäusen (data(litters) in library(DAAG)):

library(DAAG)                          
data(litters)

fit <- lm(lsize ~ bodywt + brainwt, data=litters)
RSS <- sum(fit$res^2)
SYY <- sum((litters$lsize - mean(litters$lsize))^2)
p   <- 2;   n <- 20
F   <- ((SYY-RSS)/p)/(RSS/(n-p-1)); F
## [1] 113.6513
p   <- 1-pf(F, p, n-p-1); p
## [1] 1.450194e-10
# 95% prediction interval for a single new observation x0 = (1, bodywt, brainwt).
fit <- lm(lsize ~ bodywt + brainwt, data=litters)
p   <- 2;   n <- 20
x0  <- c(1.0, 8.0, 0.4)                             # single new observation
y0  <- sum(x0*fit$coef)                             # predicted litter size
X   <- cbind(1, litters$bodywt, litters$brainwt)    # design matrix (explicit columns; originally relied on a prior attach())
xtxi  <- solve(t(X) %*% X)                          # (X'X)^-1

t     <- qt(0.975, n-p-1)                           # t-distribution quantile
sigma <- sqrt(sum(fit$res^2)/(n-p-1))               # residual standard deviation
W     <- sqrt(1 + x0 %*% xtxi %*% x0)               # root term of the prediction-interval width
round(c(y0-t*sigma*W, y0, y0+t*sigma*W), 2)
## [1] 4.49 6.37 8.24

8.2.7 Verfahren der Variablenauswahl

Beispiel Körpergewicht, Hirngewicht und Wurfgröße bei Mäusen (data(litters) in library(DAAG)):

library(DAAG)                                         
data(litters)

fit <- lm(lsize ~ ., data=litters)
drop1(fit, test="F")
fit <- lm(lsize ~ 1, data=litters)

add1(fit, ~ bodywt + brainwt, test="F")
fit <- lm(lsize ~ ., data=litters)

step(fit)
## Start:  AIC=-5.1
## lsize ~ bodywt + brainwt
## 
##           Df Sum of Sq     RSS    AIC
## <none>                  11.482 -5.100
## - brainwt  1     3.082  14.563 -2.345
## - bodywt   1    89.791 101.272 36.442
## 
## Call:
## lm(formula = lsize ~ bodywt + brainwt, data = litters)
## 
## Coefficients:
## (Intercept)       bodywt      brainwt  
##      12.899       -2.398       31.628

8.3 Varianzanalyse im linearen Modell

8.3.1 Einfaktorielle Varianzanalyse

Beispiel Wirksamkeit Antibiotika:

hemmhof <- read.csv2("hemmhof.CSV") 
hemmhof <- as.data.frame(hemmhof)
hemmhof[1:10,]
summary(hemmhof)
##  Antibiotikum      Wert      
##  A:4          Min.   : 6.80  
##  B:5          1st Qu.:11.07  
##  C:3          Median :13.65  
##               Mean   :13.49  
##               3rd Qu.:16.48  
##               Max.   :19.30

8.3.1.1 Erwartungswert-Parametrisierung

attach(hemmhof)

Antibiotikum <- as.factor(c(rep("A",4), rep("B",5), rep("C",3)))
Antibiotikum
##  [1] A A A A B B B B B C C C
## Levels: A B C
Wert <- c(13.2,14.1,7.8,11.7,15.9,16.2,19.3,18.0,17.3,6.8,9.2,12.4)
Wert
##  [1] 13.2 14.1  7.8 11.7 15.9 16.2 19.3 18.0 17.3  6.8  9.2 12.4
                                                             
fit <- lm(Wert ~ Antibiotikum - 1, data=hemmhof)            # Erwartungswerte
summary(fit)
## 
## Call:
## lm(formula = Wert ~ Antibiotikum - 1, data = hemmhof)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.900 -1.215 -0.020  1.615  2.933 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## AntibiotikumA   11.700      1.138  10.277 2.85e-06 ***
## AntibiotikumB   17.340      1.018  17.029 3.73e-08 ***
## AntibiotikumC    9.467      1.315   7.201 5.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.277 on 9 degrees of freedom
## Multiple R-squared:  0.9803, Adjusted R-squared:  0.9737 
## F-statistic: 149.2 on 3 and 9 DF,  p-value: 5.445e-08
model.matrix(fit)
##    AntibiotikumA AntibiotikumB AntibiotikumC
## 1              1             0             0
## 2              1             0             0
## 3              1             0             0
## 4              1             0             0
## 5              0             1             0
## 6              0             1             0
## 7              0             1             0
## 8              0             1             0
## 9              0             1             0
## 10             0             0             1
## 11             0             0             1
## 12             0             0             1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Antibiotikum
## [1] "contr.treatment"

8.3.1.2 Effekt-Parametrisierung: Dummy-Codierung

attach(hemmhof)

fit <- lm(Wert ~ Antibiotikum, contrasts = list(Antibiotikum="contr.treatment"))
summary(fit)
## 
## Call:
## lm(formula = Wert ~ Antibiotikum, contrasts = list(Antibiotikum = "contr.treatment"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.900 -1.215 -0.020  1.615  2.933 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     11.700      1.138  10.277 2.85e-06 ***
## AntibiotikumB    5.640      1.527   3.693  0.00498 ** 
## AntibiotikumC   -2.233      1.739  -1.284  0.23113    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.277 on 9 degrees of freedom
## Multiple R-squared:  0.7438, Adjusted R-squared:  0.6869 
## F-statistic: 13.07 on 2 and 9 DF,  p-value: 0.002179
contr.treatment(3, base = 1, contrasts = TRUE)             # Codierung von Kontrasten
##   2 3
## 1 0 0
## 2 1 0
## 3 0 1

summary(aov(Wert ~ Antibiotikum))
##              Df Sum Sq Mean Sq F value  Pr(>F)   
## Antibiotikum  2 135.49   67.75   13.07 0.00218 **
## Residuals     9  46.66    5.18                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.matrix(fit)
##    (Intercept) AntibiotikumB AntibiotikumC
## 1            1             0             0
## 2            1             0             0
## 3            1             0             0
## 4            1             0             0
## 5            1             1             0
## 6            1             1             0
## 7            1             1             0
## 8            1             1             0
## 9            1             1             0
## 10           1             0             1
## 11           1             0             1
## 12           1             0             1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Antibiotikum
## [1] "contr.treatment"

8.3.1.3 Effekt-Parametrisierung: Effekt-Codierung

fit <- lm(Wert ~ Antibiotikum, contrasts = list(Antibiotikum="contr.sum"))

summary(fit)
## 
## Call:
## lm(formula = Wert ~ Antibiotikum, contrasts = list(Antibiotikum = "contr.sum"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.900 -1.215 -0.020  1.615  2.933 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    12.8356     0.6717  19.108 1.36e-08 ***
## Antibiotikum1  -1.1356     0.9398  -1.208 0.257729    
## Antibiotikum2   4.5044     0.8927   5.046 0.000694 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.277 on 9 degrees of freedom
## Multiple R-squared:  0.7438, Adjusted R-squared:  0.6869 
## F-statistic: 13.07 on 2 and 9 DF,  p-value: 0.002179
contr.sum(3, contrasts = TRUE)                            # Codierung von Kontrasten
##   [,1] [,2]
## 1    1    0
## 2    0    1
## 3   -1   -1
model.matrix(fit)
##    (Intercept) Antibiotikum1 Antibiotikum2
## 1            1             1             0
## 2            1             1             0
## 3            1             1             0
## 4            1             1             0
## 5            1             0             1
## 6            1             0             1
## 7            1             0             1
## 8            1             0             1
## 9            1             0             1
## 10           1            -1            -1
## 11           1            -1            -1
## 12           1            -1            -1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Antibiotikum
## [1] "contr.sum"

8.3.1.4 Varianzkomponenten (ANOVA)

Funktion glht() in library(multcomp)

library(multcomp)

anova(lm(Wert~Antibiotikum))                              #  Varianzkomponenten

Simultane Konfidenzintervalle:

d    <- data.frame(Gruppe = factor(Antibiotikum), Wert)
amod <- aov(Wert ~ Antibiotikum, data=d)
summary(glht(amod, linfct = mcp(Antibiotikum = "Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = Wert ~ Antibiotikum, data = d)
## 
## Linear Hypotheses:
##            Estimate Std. Error t value Pr(>|t|)   
## B - A == 0    5.640      1.527   3.693  0.01227 * 
## C - A == 0   -2.233      1.739  -1.284  0.43720   
## C - B == 0   -7.873      1.663  -4.735  0.00259 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
par(mfrow=c(1,1), lwd=2, font.axis=2, bty="n", ps=11) 
plot(glht(amod, linfct = mcp(Antibiotikum = "Tukey")), 
     main="Tukey-Kontraste",  xlab="Hemmhof-Durchmesser (95%-KI)")

8.3.2 Zweifaktorielle Varianzanalyse

Beispiel Wirksamkeit Antibiotika (2):

# Read the two-factor inhibition-zone data. The original read the same file
# twice in a row; the redundant second read has been removed.
hemmhof <- read.csv2("hemmhof2.CSV")
hemmhof <- as.data.frame(hemmhof)
hemmhof[1:10,]

par(mfrow=c(1,1), lwd=2, font.axis=2, bty="l", ps=12)
interaction.plot(hemmhof$Antibiotikum, hemmhof$Konz, hemmhof$Wert)   # profile plot: antibiotic x concentration

Modell ohne Interaktion:

fit <- lm(Wert~ Antibiotikum + Konz, data=hemmhof,
          contrasts=list(Antibiotikum="contr.sum", Konz="contr.sum"))   
summary(fit) 
## 
## Call:
## lm(formula = Wert ~ Antibiotikum + Konz, data = hemmhof, contrasts = list(Antibiotikum = "contr.sum", 
##     Konz = "contr.sum"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6077 -2.0758  0.3299  1.9953  5.4558 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    12.6240     0.6344  19.900 3.49e-14 ***
## Antibiotikum1  -1.8356     0.9167  -2.002  0.05973 .  
## Antibiotikum2   2.6336     0.8612   3.058  0.00647 ** 
## Konz1          -0.5817     0.6350  -0.916  0.37109    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.019 on 19 degrees of freedom
## Multiple R-squared:  0.3628, Adjusted R-squared:  0.2622 
## F-statistic: 3.607 on 3 and 19 DF,  p-value: 0.03242

anova(fit)
model.matrix(fit)
##    (Intercept) Antibiotikum1 Antibiotikum2 Konz1
## 1            1             1             0    -1
## 2            1             1             0    -1
## 3            1             1             0    -1
## 4            1             1             0    -1
## 5            1             0             1    -1
## 6            1             0             1    -1
## 7            1             0             1    -1
## 8            1             0             1    -1
## 9            1             0             1    -1
## 10           1            -1            -1    -1
## 11           1            -1            -1    -1
## 12           1            -1            -1    -1
## 13           1             1             0     1
## 14           1             1             0     1
## 15           1             1             0     1
## 16           1             0             1     1
## 17           1             0             1     1
## 18           1             0             1     1
## 19           1             0             1     1
## 20           1            -1            -1     1
## 21           1            -1            -1     1
## 22           1            -1            -1     1
## 23           1            -1            -1     1
## attr(,"assign")
## [1] 0 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$Antibiotikum
## [1] "contr.sum"
## 
## attr(,"contrasts")$Konz
## [1] "contr.sum"

Test auf Interaktion:

fit1 <- update(fit, . ~ . + Antibiotikum:Konz)          
anova(fit, fit1)

Modell mit Interaktion:

fit <- lm(Wert~ Antibiotikum + Konz + Antibiotikum:Konz, data=hemmhof)         
anova(fit)

8.4 Logistische Regression

Logit-Transformation:

# Logit transformation and its inverse, shown side by side:
# left panel:  logit(p) = log(p / (1 - p)) maps (0, 1) onto the real line;
# right panel: 1 / (1 + exp(-x))           maps the real line back onto (0, 1).
x        <- seq(-5, +5, 0.2)
ilogit.x <- 1 / (1+exp(-x))
p        <- seq(0, 1, 0.02)
logit.p  <- log(p/(1-p))

par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=11)
plot(p, logit.p,
     type="b", xlim=c(0,1), ylim=c(-4, +4),
     xlab= expression(paste("Wahrscheinlichkeit  ", pi)),
     ylab= expression(log(pi / (1-pi))))
plot(x, ilogit.x,
     type="b", xlim=c(-5,+5), ylim=c(0,1),
     xlab= expression(paste("Wert x")),
     ylab= expression(paste("Wahrscheinlichkeit  ", 1/(1 + exp(-x)))))

Beispiel Challenger-Unglück:

t <- c(66,67,68,70,72,75,76,79,53,58,70,75,67,67,69,70,73,76,78,81,57,63,70)
d <- c( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1 ,1)

fit <- glm(d ~ t, family=binomial)
summary(fit)
## 
## Call:
## glm(formula = d ~ t, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0611  -0.7613  -0.3783   0.4524   2.2175  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  15.0429     7.3786   2.039   0.0415 *
## t            -0.2322     0.1082  -2.145   0.0320 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28.267  on 22  degrees of freedom
## Residual deviance: 20.315  on 21  degrees of freedom
## AIC: 24.315
## 
## Number of Fisher Scoring iterations: 5
fit$coef
## (Intercept)           t 
##  15.0429016  -0.2321627
temp <- seq(30,90,by=2)
lin  <- fit$coef[1] + fit$coef[2]*temp
prob <- exp(lin) / (1 + exp(lin))
par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=11) 

boxplot(t ~ d, ylab="Temperatur (F)", xlab="Ausfall", las=1, col="grey")
plot(temp, prob, type="b", las=1, xlab="Temperatur (F)",
                  ylab="Ausfallwahrscheinlichkeit")

8.4.1 Hypothesentest im linearen Modell

anova(fit, test="Chi")

8.4.2 Multiple logistische Regression

Beispiel Wirbelsäulenverkrümmung (data(kyphosis) in library(rpart))

library(rpart)                                  
attach(kyphosis)

# Boxplots of the three covariates split by kyphosis status.
par(mfrow=c(1,3), lwd=2, font.axis=2, bty="l", ps=12)
boxplot(Age~Kyphosis, ylab="Alter", xlab="Kyphosis", main="A")       # fixed label typo "Kyhosis"
boxplot(Number~Kyphosis, ylab="Number", xlab="Kyphosis", main="B")
boxplot(Start~Kyphosis, ylab="Start", xlab="Kyphosis", main="C")


fit <- glm(Kyphosis ~ Age + Number + Start, family="binomial", data=kyphosis)
summary(fit)
## 
## Call:
## glm(formula = Kyphosis ~ Age + Number + Start, family = "binomial", 
##     data = kyphosis)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3124  -0.5484  -0.3632  -0.1659   2.1613  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.036934   1.449575  -1.405  0.15996   
## Age          0.010930   0.006446   1.696  0.08996 . 
## Number       0.410601   0.224861   1.826  0.06785 . 
## Start       -0.206510   0.067699  -3.050  0.00229 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.234  on 80  degrees of freedom
## Residual deviance: 61.380  on 77  degrees of freedom
## AIC: 69.38
## 
## Number of Fisher Scoring iterations: 5
anova(fit, test="Chi")
fit1 <- update(fit, . ~ .- Age)
anova(fit, fit1, test="Chi")

8.4.3 Interpretation der Modellparameter

Modellrechnungen - Vorhersage:

fit <- glm(Kyphosis ~ Age + Number + Start, family="binomial", data=kyphosis)

new.d <- data.frame(Age=c(12,24,60), Number=c(2, 4, 6), Start=c(15, 10, 5))
new.p <- round(predict(fit, new.d, type = "response"), 4)

cbind(new.d, new.p)

Funktion nomogram() in library(rms):

Beispiel Wirbelsäulenverkrümmung (data(kyphosis) in library(rpart))

library(rpart); library(rms)
data(kyphosis); df <- kyphosis

ddist <- datadist(df); options(datadist='ddist')
fit <- lrm(Kyphosis ~ Age + Number + Start, data=df, x=TRUE, y=TRUE)

nom <- nomogram(fit, fun=function(x)1/(1+exp(-x)), 
                fun.at=c(.01,.05,seq(.1,.9,by=.1),.95,.99),
                lp=FALSE, funlabel="Risiko für Kyphose")

plot(nom, xfrac=.45)


print(nom)
## Points per unit of linear predictor: NaN 
## Linear predictor units per point   : NaN 
## 
## 
##  Age Points
##    0  0    
##   20  6    
##   40 12    
##   60 18    
##   80 24    
##  100 29    
##  120 35    
##  140 41    
##  160 47    
##  180 53    
##  200 59    
##  220 65    
## 
## 
##  Number Points
##   2      0    
##   3     11    
##   4     22    
##   5     33    
##   6     44    
##   7     55    
##   9     77    
##  10     88    
## 
## 
##  Start Points
##   0    100   
##   2     89   
##   4     78   
##   6     67   
##   8     56   
##  10     44   
##  12     33   
##  14     22   
##  16     11   
##  18      0   
## 
## 
##  Total Points Risiko für Kyphose
##             9               0.01
##            53               0.05
##            74               0.10
##            95               0.20
##           110               0.30
##           122               0.40
##           133               0.50
##           144               0.60
##           155               0.70
##           170               0.80
##           192               0.90
##           212               0.95

8.4.4 Variablenauswahl (Schrittweise)

Funktion stepAIC() in library(MASS):

Beispiel Wirbelsäulenverkrümmung (data(kyphosis) in library(rpart))

library(rpart); data(kyphosis)                                    
library(MASS)

model <- glm(Kyphosis ~ 1,  family = binomial, data = kyphosis); model
## 
## Call:  glm(formula = Kyphosis ~ 1, family = binomial, data = kyphosis)
## 
## Coefficients:
## (Intercept)  
##      -1.326  
## 
## Degrees of Freedom: 80 Total (i.e. Null);  80 Residual
## Null Deviance:       83.23 
## Residual Deviance: 83.23     AIC: 85.23

model.step <- stepAIC(model, Kyphosis ~ Age + Number + Start, 
                                        trace = FALSE, direction="both")
model.step$anova

8.4.5 Residuenanalyse

Beispiel Wirbelsäulenverkrümmung (data(kyphosis) in library(rpart))

# Residual analysis for the logistic regression model on the kyphosis data.
library(rpart); data(kyphosis) 

fit <- glm(Kyphosis ~ Age + Number + Start, data=kyphosis, family="binomial")
deviance.resid <- residuals(fit)                 # deviance residuals (default type)
pearson.resid  <- residuals(fit, type="pearson") # Pearson residuals
hats <- influence(fit)$hat                       # leverage values h_i
# Influence measure combining deviance and leverage-weighted Pearson residuals;
# presumably the approximate change in deviance when dropping observation i -- verify against the book's formula.
idev <- deviance.resid^2 + pearson.resid^2 * hats/(1-hats)

par(mfrow=c(1,3), lwd=2, font.axis=2, bty="l", ps=14)
plot(deviance.resid, xlab="Beobachtung", ylab="Residuen nach der Devianz")
plot(pearson.resid, xlab="Beobachtung", ylab="Residuen nach Pearson")
plot(idev, xlab="Beobachtung",
           ylab="Einflussnahme (influence points)", type="b")