Resma3 vs Base R

In this section we will see how some of the problems we solved with Resma3 commands can also be solved with base R.

Graphs

Histogram

x <- rnorm(1000, 10, 2)
hplot(x, n=50)

hist(x, 50)

Boxplot

bplot(x)

boxplot(x)

attach(mothers)
bplot(Length, Status)

boxplot(Length~Status)

Scatterplot

attach(wine)
splot(Heart.Disease.Deaths, Wine.Consumption)

plot(Wine.Consumption, Heart.Disease.Deaths)

Summary Statistics

fivenumber(x, ndigit = 2)

##  Minimum   Q1 Median    Q3 Maximum
##        4 8.76   9.89 11.16   16.77
## IQR =  2.4

round(c(min(x), quantile(x, 0.25), median(x), quantile(x, 0.75), max(x)), 2)

##         25%         75%       
##  4.00  8.76  9.89 11.16 16.77

stat.table(x, ndigit = 2)

##   Sample Size Mean Standard Deviation
## x        1000 9.89               1.94

round(c(length(x), mean(x), sd(x)), 2)

## [1] 1000.00    9.89    1.94

Confidence Intervals/Hypothesis Tests

Mean

one.sample.t(x, conf.level = 90,  ndigit = 3)

## A 90% confidence interval for the population mean is (9.794, 9.996)

t.test(x, conf.level = 0.9)

## 
##  One Sample t-test
## 
## data:  x
## t = 161.47, df = 999, p-value < 0.00000000000000022
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
##  9.794011 9.995797
## sample estimates:
## mean of x 
##  9.894904

one.sample.t(x, mu.null = 10, 
             alternative = "greater", ndigit = 3)

## p value of test H0: mu=10 vs. Ha: mu > 10:  0.0433

t.test(x, mu=10, alternative = "greater")

## 
##  One Sample t-test
## 
## data:  x
## t = -1.715, df = 999, p-value = 0.9567
## alternative hypothesis: true mean is greater than 10
## 95 percent confidence interval:
##  9.794011      Inf
## sample estimates:
## mean of x 
##  9.894904

The t.ps command does not exist in base R.

Proportion

one.sample.prop(60, 100, conf.level = 90,  ndigit = 3)

## A 90% confidence interval for the population proportion is (0.513, 0.682)

prop.test(60, 100, conf.level = 0.9)

## 
##  1-sample proportions test with continuity correction
## 
## data:  60 out of 100, null probability 0.5
## X-squared = 3.61, df = 1, p-value = 0.05743
## alternative hypothesis: true p is not equal to 0.5
## 90 percent confidence interval:
##  0.5127842 0.6816248
## sample estimates:
##   p 
## 0.6

one.sample.prop(60, 100, pi.null = 0.5, 
             alternative = "greater", ndigit = 3)

## p value of test H0: pi=0.5 vs. Ha: pi > 0.5:  0.0287

prop.test(60, 100, p=0.5, alternative = "greater")

## 
##  1-sample proportions test with continuity correction
## 
## data:  60 out of 100, null probability 0.5
## X-squared = 3.61, df = 1, p-value = 0.02872
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.5127842 1.0000000
## sample estimates:
##   p 
## 0.6

The prop.ps command does not exist in base R.

Correlation

attach(draft)
pearson.cor(Draft.Number, Day.of.Year, conf.level = 90)

## A 90% confidence interval for the 
## population correlation coefficient is ( -0.306, -0.143 )

cor.test(Draft.Number, Day.of.Year, conf.level = 0.9)

## 
##  Pearson's product-moment correlation
## 
## data:  Draft.Number and Day.of.Year
## t = -4.4272, df = 364, p-value = 0.00001264
## alternative hypothesis: true correlation is not equal to 0
## 90 percent confidence interval:
##  -0.3061994 -0.1427007
## sample estimates:
##        cor 
## -0.2260414

pearson.cor(Draft.Number, Day.of.Year, rho.null = 0)

## p value of test H0: rho=0 vs. Ha: rho <> 0:  0.000

Regression

Simple Regression

slr(Draft.Number, Day.of.Year)

## The least squares regression equation is: 
##  Draft.Number  = 225.009 - 0.226 Day.of.Year 
## R^2 = 5.11%

summary(lm(Draft.Number~Day.of.Year))

## 
## Call:
## lm(formula = Draft.Number ~ Day.of.Year)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -210.837  -85.629   -0.519   84.612  196.157 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)
## (Intercept) 225.00922   10.81197  20.811 < 0.0000000000000002
## Day.of.Year  -0.22606    0.05106  -4.427            0.0000126
## 
## Residual standard error: 103.2 on 364 degrees of freedom
## Multiple R-squared:  0.05109,    Adjusted R-squared:  0.04849 
## F-statistic:  19.6 on 1 and 364 DF,  p-value: 0.00001264

Multiple Regression

attach(houseprice)
mlr(Price, houseprice[, -1])

## The least squares regression equation is: 
##  Price  =  -67.62 + 0.086 Sqfeet - 26.493 Floors - 9.286 Bedrooms + 37.381 Baths 
## R^2 = 88.6%

summary(lm(Price ~ ., data=houseprice))

## 
## Call:
## lm(formula = Price ~ ., data = houseprice)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.018  -5.943   1.860   5.947  30.955 
## 
## Coefficients:
##              Estimate Std. Error t value     Pr(>|t|)
## (Intercept) -67.61984   17.70818  -3.819     0.000882
## Sqfeet        0.08571    0.01076   7.966 0.0000000462
## Floors      -26.49306    9.48952  -2.792     0.010363
## Bedrooms     -9.28622    6.82985  -1.360     0.187121
## Baths        37.38067   12.26436   3.048     0.005709
## 
## Residual standard error: 13.71 on 23 degrees of freedom
## Multiple R-squared:  0.8862, Adjusted R-squared:  0.8665 
## F-statistic:  44.8 on 4 and 23 DF,  p-value: 0.0000000001558

Best Subset Regression

library(leaps)
mallows(Price, houseprice[, -1])

##  Number of Variables Cp   Sqfeet Floors Bedrooms Baths
##  1                   8.83 X                           
##  2                   8.81 X                      X    
##  3                   4.85 X      X               X    
##  4                   5    X      X      X        X

leaps(houseprice[, -1], Price)

## $which
##       1     2     3     4
## 1  TRUE FALSE FALSE FALSE
## 1 FALSE FALSE FALSE  TRUE
## 1 FALSE FALSE  TRUE FALSE
## 1 FALSE  TRUE FALSE FALSE
## 2  TRUE FALSE FALSE  TRUE
## 2  TRUE  TRUE FALSE FALSE
## 2  TRUE FALSE  TRUE FALSE
## 2 FALSE FALSE  TRUE  TRUE
## 2 FALSE  TRUE FALSE  TRUE
## 2 FALSE  TRUE  TRUE FALSE
## 3  TRUE  TRUE FALSE  TRUE
## 3  TRUE FALSE  TRUE  TRUE
## 3  TRUE  TRUE  TRUE FALSE
## 3 FALSE  TRUE  TRUE  TRUE
## 4  TRUE  TRUE  TRUE  TRUE
## 
## $label
## [1] "(Intercept)" "1"           "2"           "3"           "4"          
## 
## $size
##  [1] 2 2 2 2 3 3 3 3 3 3 4 4 4 4 5
## 
## $Cp
##  [1]   8.834171  92.088525 104.303380 161.057329   8.812489  10.306028
##  [7]  10.812154  66.886236  77.214388  87.881962   4.848657  10.794275
## [13]  12.289752  66.450032   5.000000

ANOVA

oneway

oneway(Length, Status)

## p value of test of equal means: p = 0.000 
## Smallest sd:  2.5    Largest sd : 3.6

summary(aov(Length~Status))

##             Df Sum Sq Mean Sq F value   Pr(>F)
## Status       2  181.4   90.69   9.319 0.000208
## Residuals   91  885.6    9.73

twoway

attach(gasoline)
twoway(MPG, Gasoline, Automobile)

##             Df Sum Sq Mean Sq F value            Pr(>F)
## x            3 25.405   8.468  90.464 0.000000000000321
## z            2  0.527   0.263   2.813            0.0799
## x:z          6  0.909   0.151   1.618            0.1854
## Residuals   24  2.247   0.094                          
##                   [,1]
## Gasoline  p =   0.0000
## Automobile  p = 0.0799
## Interaction p = 0.1854

G <- as.factor(Gasoline)
A <- as.factor(Automobile)
summary(aov(MPG ~ G * A))

##             Df Sum Sq Mean Sq F value            Pr(>F)
## G            3 25.405   8.468  90.464 0.000000000000321
## A            2  0.527   0.263   2.813            0.0799
## G:A          6  0.909   0.151   1.618            0.1854
## Residuals   24  2.247   0.094

twoway(MPG, Gasoline, Automobile, with.interaction = FALSE)

##             Df Sum Sq Mean Sq F value             Pr(>F)
## x            3 25.405   8.468  80.510 0.0000000000000189
## z            2  0.527   0.263   2.504             0.0987
## Residuals   30  3.156   0.105                           
##                   [,1]
## Gasoline  p =   0.0000
## Automobile  p = 0.0987

summary(aov(MPG ~ G + A))

##             Df Sum Sq Mean Sq F value             Pr(>F)
## G            3 25.405   8.468  80.510 0.0000000000000189
## A            2  0.527   0.263   2.504             0.0987
## Residuals   30  3.156   0.105