R uses a consistent naming scheme for its probability distribution functions:

d gives the probability density (or mass) function
p gives the cumulative distribution function
q gives the quantile function (the inverse of p)
r gives random number generation

Combine these prefixes with the base name of the distribution. For example, rbinom gives a set of random values drawn from a binomial distribution, whereas dnorm gives the density function for a normal distribution. There are many probability distributions available in R, but we will discuss only 7 of them.
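As a quick illustration of the four prefixes, here is a small sketch (not part of the original code) that combines each of them with the normal distribution as the base name:

# the four prefixes, combined with "norm" for the normal distribution
dnorm(x=0)      # density at x = 0 (about 0.399)
pnorm(q=1.96)   # cumulative probability below 1.96 (about 0.975)
qnorm(p=0.975)  # quantile for p = 0.975 (about 1.96); the inverse of pnorm
rnorm(n=5)      # five random draws from a standard normal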
Before exploring the distributions, we first build two utility functions for plotting histograms (my_histo()) and pdfs (my_pdf()).

############################
# Function: my_histo
# Purpose: creates a ggplot histogram
# Requires: ggplot
# Input: x = a numeric vector
# data_type= "cont" or "disc"
# Output: a ggplot histogram
############################
library(ggplot2)
my_histo <- function(x=NULL, data_type="cont") {
  if (is.null(x)) x <- runif(1000)
  df <- data.frame(x=x)

  # if data are continuous and bounded (0,1), adjust bins for histogram
  if (data_type=="cont" & min(df$x) > 0 & max(df$x) < 1) {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_histogram(boundary=0,
                     binwidth=1/30,
                     color="black",
                     fill="goldenrod") +
      scale_x_continuous(limits=c(0,1))
  }

  # if data are continuous, but not bounded (0,1),
  # use ggplot default bins
  if (data_type=="cont" & (min(df$x) < 0 | max(df$x) > 1)) {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_histogram(color="black",
                     fill="goldenrod")
  }

  # if data are discrete integers,
  # use geom_bar to create a histogram
  if (data_type=="disc") {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_bar(color="black", fill="goldenrod")
  }

  print(p1)
}
my_histo()
my_histo(data_type="disc",x=rpois(1000,lambda=0.2))
my_histo(data_type="cont",x=runif(1000))
my_histo(data_type="cont",x=rnorm(n=1000,mean=0,sd=1))
############################
# Function: my_pdf
# Purpose: creates a ggplot probability density function
# Requires: ggplot
# Input: x = a numeric vector of x values
# y = pdf values calculated for each value of x
# data_type= "cont" or "disc"
# Output: a ggplot pdf
############################
my_pdf <- function(x=NULL, y=NULL, data_type="cont") {
  if (is.null(x) | is.null(y)) {
    x <- seq(from=-3, to=3, length.out=100)
    y <- dnorm(x)
  }
  df <- data.frame(x=x, y=y)

  # for continuous distributions,
  # plot the line for the pdf
  if (data_type=="cont") {
    p1 <- ggplot(data=df) +
      aes(x=x, y=y) +
      geom_line() +
      geom_area(fill="cornflowerblue")
  }

  # for discrete distributions,
  # plot a bar for the probability at each value
  if (data_type=="disc") {
    p1 <- ggplot(data=df) +
      aes(x=x, y=y) +
      geom_col(color="black", fill="cornflowerblue")
  }

  print(p1)
}
my_pdf()
my_x=seq(from=0,to=1,length.out=100)
my_pdf(x=my_x,y=dbeta(x=my_x,shape1=15,shape2=10))
my_pdf(x=0:10,y=dpois(x=0:10,lambda=1.1),data_type="disc")

library(ggplot2)
library(MASS)
#-------------------------------------------------
# Poisson distribution
# Discrete X >= 0
# Random events with a constant rate lambda
# (observations per time or per unit area)
# Parameter lambda > 0
# "d" function generates probability density
hits <- 0:10
my_vec <- dpois(x=hits,lambda=1)
my_pdf(x=hits,y=my_vec,data_type="disc")
my_vec <- dpois(x=hits,lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
sum(my_vec) # the densities should sum to 1.0 (total area under the curve)
# but the sum is not quite 1 because we need more elements in hits:
hits <- 0:15
my_vec <- dpois(x=hits, lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
sum(my_vec)
# now the sum is correct
# for a Poisson distribution with lambda=2,
# what is the probability that a single draw will yield X=0?
dpois(x=0,lambda=2)
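As a cross-check (a small added sketch), the probability of zero events in a Poisson is exp(-lambda), and the proportion of zeroes in a large random sample should be close to the same value:

# P(X=0) for a Poisson = exp(-lambda)
exp(-2)
# proportion of zeroes in a large simulated sample should be similar
mean(rpois(n=100000,lambda=2)==0)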
# "p" function generates cumulative probability density; gives the
# "lower tail" cumulative area of the distribution
hits <- 0:10
my_vec <- ppois(q=hits,lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
# for a Poisson distribution with lambda=2,
# what is the probability of getting 1 or fewer hits?
ppois(q=1, lambda=2)
# We could also get this through dpois
p_0 <- dpois(x=0,lambda=2)
p_0
p_1 <- dpois(x=1,lambda=2)
p_1
p_0 + p_1
# The q function is the inverse of p
# What is the number of hits corresponding to 50% of the probability mass
qpois(p=0.5,lambda=2.5)
my_pdf(x=0:10,y=dpois(x=0:10,lambda=2.5),data_type="disc")
# but distribution is discrete, so this is not exact
ppois(q=2,lambda=2.5)
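For a discrete distribution, qpois returns the smallest number of hits whose cumulative probability is at least p; a small added check makes the round-trip explicit:

# qpois gives the smallest count whose cumulative probability >= p
ppois(q=1,lambda=2.5) # still below 0.5
ppois(q=2,lambda=2.5) # reaches 0.5, so qpois(p=0.5,lambda=2.5) returns 2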
# finally, we can simulate individual values from a poisson
ran_pois <- rpois(n=1000,lambda=2.5)
my_histo(x=ran_pois,data_type="disc")
# for real or simulated data, we can use the quantile
# function to find the empirical 95% confidence interval on the data
quantile(x=ran_pois,probs=c(0.025,0.975))
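For comparison (an added check, not in the original script), the theoretical quantiles from qpois should be close to the empirical quantiles computed above:

# theoretical 2.5% and 97.5% quantiles for a Poisson with lambda=2.5
qpois(p=c(0.025,0.975),lambda=2.5)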

#-------------------------------------------------
# Binomial distribution
# p = probability of a dichotomous outcome
# size = number of trials
# x = possible outcomes
# outcome x is bounded between 0 and number of trials
# use "d" binom for density function
hits <- 0:10
my_vec <- dbinom(x=hits,size=10,prob=0.5)
my_pdf(x=0:10,y=my_vec,data_type="disc")
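These densities come from the usual binomial counting formula; here is a small added check that dbinom matches it for one value:

# dbinom(x,size,prob) = choose(size,x)*prob^x*(1-prob)^(size-x)
dbinom(x=5,size=10,prob=0.5)
choose(10,5)*0.5^5*0.5^5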
# and how does this compare to an actual simulation of 50 sets of 100 coin tosses?
my_coins <- rbinom(n=50,size=100,prob=0.5)
my_histo(x=my_coins,data_type="disc")
quantile(x=my_coins,probs=c(0.025,0.975))

#-------------------------------------------------
# negative binomial: number of failures (values of my_vec)
# in a series of Bernoulli trials with p = probability of success,
# before a target number of successes (= size) is reached
# generates a discrete distribution that is more
# heterogeneous ("overdispersed") than Poisson
hits <- 0:40
my_vec <- dnbinom(x=hits, size=5, prob=0.5)
my_pdf(x=hits,y=my_vec,data_type="disc")
# the geometric distribution is a special case with size = 1 success
# each bar is a constant fraction (1 - prob) of the bar before it
my_vec <- dnbinom(x=hits, size=1, prob=0.1)
my_pdf(x=hits,y=my_vec,data_type="disc")
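A quick added check that the size = 1 case really is the geometric distribution:

# with size=1, dnbinom matches dgeom
dnbinom(x=0:5,size=1,prob=0.1)
dgeom(x=0:5,prob=0.1)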
# alternatively, specify the mean (mu) of the distribution and size,
# the dispersion parameter (smaller size = more dispersed)
# this gives us a Poisson whose lambda value varies;
# the dispersion parameter acts as the shape parameter of a gamma,
# and as it increases, the distribution has a smaller variance
# (a quick check of this gamma-Poisson equivalence follows the first simulation below)
# just simulate it directly
nbi_ran <- rnbinom(n=1000,size=10,mu=5)
my_histo(x=nbi_ran,data_type="disc")
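Here is a small sketch (added; the object names nb_size, nb_mu, lambda_vals, and mixture_draws are just for this check) of the gamma-Poisson equivalence: draw lambda from a gamma with shape = size and mean = mu, then draw Poisson counts with those lambdas, and the result should resemble rnbinom with the same size and mu:

# negative binomial as a gamma mixture of Poissons
nb_size <- 10
nb_mu <- 5
lambda_vals <- rgamma(n=1000,shape=nb_size,rate=nb_size/nb_mu) # mean = nb_mu
mixture_draws <- rpois(n=1000,lambda=lambda_vals)
my_histo(x=mixture_draws,data_type="disc") # compare with the rnbinom histogram above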
nbi_ran <- rnbinom(n=1000,size=0.1,mu=5)
my_histo(x=nbi_ran,data_type="disc")

#-------------------------------------------------
# normal distribution
my_norm <- rnorm(n=100,mean=100,sd=2)
my_histo(x=my_norm,data_type="cont")
# problems arise with the normal when the mean is small but values at or below zero are not allowed
my_norm <- rnorm(n=100,mean=2,sd=2)
my_histo(x=my_norm,data_type="cont")
summary(my_norm)
toss_zeroes <- my_norm[my_norm>0]
my_histo(x=toss_zeroes,data_type="cont")
summary(toss_zeroes)
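How much of the distribution are we throwing away? A small added check with pnorm, which also shows why the truncated sample has a higher mean than the one we asked for:

# expected fraction of values at or below zero for a normal with mean=2, sd=2
pnorm(q=0,mean=2,sd=2)
# fraction actually discarded from the simulated sample
1 - length(toss_zeroes)/length(my_norm)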

#-------------------------------------------------
# gamma distribution: continuous positive values, bounded at 0
my_gamma <- rgamma(n=100,shape=1,scale=10)
my_histo(x=my_gamma,data_type="cont")
# gamma with shape= 1 is an exponential with scale = mean
# shape <=1 gives a mode near zero; very small shape rounds to zero
my_gamma <- rgamma(n=100,shape=0.1,scale=1)
my_histo(x=my_gamma,data_type="cont")
# a large shape parameter moves the distribution towards a normal
my_gamma <- rgamma(n=100,shape=20,scale=1)
my_histo(x=my_gamma,data_type="cont")
# the scale parameter changes the mean, and the variance too!
my_histo(x=rgamma(n=100,shape=2,scale=100),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=10),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=1),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=0.1),data_type="cont")
# unlike the normal, the two parameters jointly affect both the mean and the variance:
# mean = shape*scale
# variance = shape*scale^2
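A quick empirical check of those formulas (an added sketch; check_gamma is just a throwaway object): with shape = 2 and scale = 10 the mean should be near 20 and the variance near 200:

# check mean = shape*scale and variance = shape*scale^2
check_gamma <- rgamma(n=100000,shape=2,scale=10)
mean(check_gamma) # should be close to 2*10 = 20
var(check_gamma)  # should be close to 2*10^2 = 200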

#-------------------------------------------------
# beta distribution
# bounded at 0 and 1
# analogous to a binomial, but the result is a continuous distribution of probabilities
# parameter shape1 = number of successes + 1
# parameter shape2 = number of failures + 1
# interpret these in terms of a coin you are tossing
# shape1 = 1, shape2 = 1 = "no data"
my_beta <- rbeta(n=1000,shape1=1,shape2=1)
my_histo(x=my_beta,data_type="cont")
# shape1 = 2, shape2 = 1 = "1 coin toss, and it comes up heads!"
my_beta <- rbeta(n=1000,shape1=2,shape2=1)
my_histo(x=my_beta,data_type="cont")
# two tosses, 1 head and 1 tail
my_beta <- rbeta(n=1000,shape1=2,shape2=2)
my_histo(x=my_beta,data_type="cont")
# two tosses, both heads
my_beta <- rbeta(n=1000,shape1=3,shape2=1)
my_histo(x=my_beta,data_type="cont")
# let's get more data
my_beta <- rbeta(n=1000,shape1=20,shape2=20)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=500,shape2=500)
my_histo(x=my_beta,data_type="cont")
# if the coin is biased
my_beta <- rbeta(n=1000,shape1=1000,shape2=500)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=10,shape2=5)
my_histo(x=my_beta,data_type="cont")
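The center of these biased-coin distributions is the mean of the beta, shape1/(shape1 + shape2); a quick added check against the sample just drawn:

# mean of a beta = shape1/(shape1 + shape2)
10/(10 + 5)   # theoretical mean, about 0.67
mean(my_beta) # empirical mean of the sample drawn with shape1=10, shape2=5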
# shape parameters less than 1.0 give us a u-shaped distribution
my_beta <- rbeta(n=1000,shape1=0.1,shape2=0.1)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=0.5,shape2=0.2)
my_histo(x=my_beta,data_type="cont")

p(data|hypothesis) is the standard approach for null hypothesis tests. Model parameters can be thought of as a hypothesis about the distribution of the data: p(data|parameters). This would be a goodness-of-fit test of the data to a particular statistical distribution.

More useful might be p(parameters|data): for a given set of data, what is the probability that a particular statistical distribution (such as a normal with a specified mean and standard deviation) provides a fit to these data? Clearly, we want the parameters that maximize this probability. The parameters that fit this definition are the maximum likelihood estimators.

Here is a simple example. For this procedure we use fitdistr (from the MASS package) to find the maximum likelihood estimators of the parameters. The data are a few measurements from Lauren Ash's dissertation: snout-vent length (SVL) measurements for 6 specimens of green frogs (Lithobates clamitans) collected from Berlin Pond in 2016.
frog_data <- c(30.15,26.3,27.5,22.9,27.8,26.2)
# get and print model parameters assuming a normal distribution
z<- fitdistr(frog_data,"normal")
print(z)
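What fitdistr maximizes is the log-likelihood of the data, which it also stores in the loglik component of its output. As a small added sketch, we can compute that value directly with dnorm and confirm that a deliberately poor parameter choice scores lower:

# log-likelihood of the data at the fitted parameters
sum(dnorm(frog_data,mean=z$estimate["mean"],sd=z$estimate["sd"],log=TRUE))
z$loglik # same value, stored by fitdistr
# a poor choice of mean gives a lower log-likelihood
sum(dnorm(frog_data,mean=20,sd=z$estimate["sd"],log=TRUE))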
# plot the density function for the normal and annotate the plot with the original data
x <- 1:100
g_density <- dnorm(x=x,mean=z$estimate["mean"],sd=z$estimate["sd"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001)
# now do the same for a gamma distribution, which has a shape and rate parameter as outputs from fitdistr
z<- fitdistr(frog_data,"gamma")
print(z)
# plot the density function for the gamma and annotate the plot with the original data
x <- 1:100
g_density <- dgamma(x=x,shape=z$estimate["shape"],rate=z$estimate["rate"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001,color="red")

There is not much difference between the two fits here. But now let's add a weird outlier, a tiny value, and see what happens:
frog_data <- c(30.15,26.3,27.5,22.9,27.8,26.2)
outlier <- 0.050
frog_data <- c(frog_data,outlier)
# get and print model parameters assuming a normal distribution
z<- fitdistr(frog_data,"normal")
print(z)
# plot the density function for the normal and annotate the plot with the original data
x <- 1:100
g_density <- dnorm(x=x,mean=z$estimate["mean"],sd=z$estimate["sd"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001)
# now do the same for a gamma distribution, which has a shape and rate parameter as outputs from fitdistr
z<- fitdistr(frog_data,"gamma")
print(z)
# plot the density function for the gamma and annotate the plot with the original data
x <- 1:100
g_density <- dgamma(x=x,shape=z$estimate["shape"],rate=z$estimate["rate"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001,color="red")