library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
z <- read.csv("hw6_dataset.csv",header=TRUE,sep=",")
z <- na.exclude(z) %>%
filter(rgr.week2 >0)
str(z)
## 'data.frame': 79 obs. of 6 variables:
## $ sheet.id : int 1 3 4 5 6 8 9 10 11 13 ...
## $ population : chr "DI" "DI" "DI" "DI" ...
## $ rgr.week1 : num 0.199 0.199 0.216 0.179 0.199 ...
## $ rgr.week2 : num 0.252 0.244 0.17 0.224 0.22 ...
## $ sumovicells : int 144 485 133 0 2 867 191 346 398 116 ...
## $ survivalweek2: int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'exclude' Named int [1:13] 2 7 12 16 23 38 40 45 71 86 ...
## ..- attr(*, "names")= chr [1:13] "2" "7" "12" "16" ...
summary(z)
## sheet.id population rgr.week1 rgr.week2
## Min. : 1.00 Length:79 Min. :0.0000 Min. :0.03188
## 1st Qu.: 26.00 Class :character 1st Qu.:0.1733 1st Qu.:0.19438
## Median : 65.00 Mode :character Median :0.1991 Median :0.22409
## Mean : 84.37 Mean :0.1818 Mean :0.20829
## 3rd Qu.:144.00 3rd Qu.:0.2162 3rd Qu.:0.24780
## Max. :213.00 Max. :0.2441 Max. :0.29098
## sumovicells survivalweek2
## Min. : 0.0 Min. :1
## 1st Qu.: 0.0 1st Qu.:1
## Median : 38.0 Median :1
## Mean : 162.9 Mean :1
## 3rd Qu.: 168.0 3rd Qu.:1
## Max. :1572.0 Max. :1
# # quick and dirty, a truncated normal distribution to work on the solution set
#
# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# summary(z$myVar)
p1 <- ggplot(data=z, aes(x=rgr.week2, y=after_stat(density))) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## â„ą Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normal
normPars <- fitdistr(z$rgr.week2,"normal")
print(normPars)
## mean sd
## 0.208293229 0.063896590
## (0.007188928) (0.005083340)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.2083 0.0639
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.00719 0.00508
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 5.17e-05 0.00 0.00 2.58e-05
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 79
## $ loglik : num 105
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.2082932
normal
prob densitymeanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$rgr.week2),len=length(z$rgr.week2))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$rgr.week2), args = list(mean = meanML, sd = sdML))
p1 + stat
## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## â„ą Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
exponential
prob densityexpoPars <- fitdistr(z$rgr.week2,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$rgr.week2), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
uniform
prob densitystat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$rgr.week2), args = list(min=min(z$rgr.week2), max=max(z$rgr.week2)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gamma
probability densityis.finite(z$rgr.week2)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE
gammaPars <- fitdistr(z$rgr.week2,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$rgr.week2), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
beta
prob densitypSpecial <- ggplot(data=z, aes(x=rgr.week2/(max(rgr.week2 + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$rgr.week2/max(z$rgr.week2 + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$rgr.week2), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
Looking at the second to last graph, it seems like the gamma does fit
well, but the normal distribution looks like the closest match. It seems
to take into account the peak in the data around 0.225
.
NewData <- rnorm(n=79,mean=meanML, sd=sdML)
NewData <- data.frame(1:79,NewData)
names(NewData) <- list("ID","myVar")
NewPlot <- ggplot(data=NewData, aes(x=myVar, y=after_stat(density))) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
geom_density(linetype="dotted",size=0.75)
print(NewPlot)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Old Data
ogPlot <- ggplot(data=z, aes(x=rgr.week2, y=after_stat(density))) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
geom_density(linetype="dotted",size=0.75)
print(ogPlot)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The simulated model has a much more normal curve, with less tails
whereas the actual data has a long left tail. I don’t think this matches
the original data well, since the original data has a unpredictable left
tail that would not be likely to be replicated, especially with a
relatively small standard deviation of sdML = 0.0709
.