d gives the probability density function
p gives the cumulative distribution function
q gives the quantile function (the inverse of p)
r gives random number generation

Combine these prefixes with the base name of the distribution. For example, rbinom gives a set of random values drawn from a binomial, whereas dnorm gives the density function for a normal distribution. There are many probability distributions available in R, but we will discuss only 7 of them.
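For example, combining the four prefixes with the base name norm for the standard normal (a quick sketch; the printed values are approximate):
dnorm(x=0)      # density of the standard normal at x = 0 (about 0.40)
pnorm(q=1.96)   # cumulative probability below 1.96 (about 0.975)
qnorm(p=0.975)  # quantile for p = 0.975 (about 1.96); the inverse of pnorm
rnorm(n=5)      # five random draws from the standard normal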
First, we will build two plotting functions: one for histograms (my_histo()) and one for probability density functions (my_pdf()).
############################
# Function: my_histo
# Purpose: creates a ggplot histogram
# Requires: ggplot
# Input: x = a numeric vector
# data_type= "cont" or "disc"
# Output: a ggplot histogram
############################
library(ggplot2)
my_histo <- function(x=NULL, data_type="cont") {
  if (is.null(x)) x <- runif(1000)
  df <- data.frame(x=x)
  # if data are continuous and bounded (0,1), adjust bins for histogram
  if (data_type=="cont" & min(df$x) > 0 & max(df$x) < 1) {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_histogram(boundary=0,
                     binwidth=1/30,
                     color="black",
                     fill="goldenrod") +
      scale_x_continuous(limits=c(0,1))
  }
  # if data are continuous, but not bounded (0,1), use
  # ggplot default bins
  if (data_type=="cont" & (min(df$x) < 0 | max(df$x) > 1)) {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_histogram(color="black",
                     fill="goldenrod")
  }
  # if data are discrete integers,
  # use geom_bar to create a histogram
  if (data_type=="disc") {
    p1 <- ggplot(data=df) +
      aes(x=x) +
      geom_bar(color="black", fill="goldenrod")
  }
  print(p1)
}
my_histo()
my_histo(data_type="disc",x=rpois(1000,lambda=0.2))
my_histo(data_type="cont",x=runif(1000))
my_histo(data_type="cont",x=rnorm(n=1000,mean=0,sd=1))
############################
# Function: my_pdf
# Purpose: creates a ggplot probability density function
# Requires: ggplot
# Input: x = a numeric vector of x values
# y = pdf values calculated for each value of x
# data_type= "cont" or "disc"
# Output: a ggplot pdf
############################
my_pdf <- function(x=NULL, y=NULL, data_type="cont") {
  if (is.null(x) | is.null(y)) {
    x <- seq(from=-3, to=3, length.out=100)
    y <- dnorm(x)
  }
  df <- data.frame(x=x, y=y)
  # for continuous distributions,
  # plot the line for the pdf
  if (data_type=="cont") {
    p1 <- ggplot(data=df) +
      aes(x=x, y=y) +
      geom_line() +
      geom_area(fill="cornflowerblue")
  }
  # for discrete distributions,
  # plot a bar for the probability at each value
  if (data_type=="disc") {
    p1 <- ggplot(data=df) +
      aes(x=x, y=y) +
      geom_col(color="black", fill="cornflowerblue")
  }
  print(p1)
}
my_pdf()
my_x=seq(from=0,to=1,length.out=100)
my_pdf(x=my_x,y=dbeta(x=my_x,shape1=15,shape2=10))
my_pdf(x=0:10,y=dpois(x=0:10,lambda=1.1),data_type="disc")
library(ggplot2)
library(MASS)
#-------------------------------------------------
# Poisson distribution
# Discrete X >= 0
# Random events with a constant rate lambda
# (observations per time or per unit area)
# Parameter lambda > 0
# "d" function generates probability density
hits <- 0:10
my_vec <- dpois(x=hits,lambda=1)
my_pdf(x=hits,y=my_vec,data_type="disc")
my_vec <- dpois(x=hits,lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
sum(my_vec) # sum of density function = 1.0 (total area under curve)
# sum is not quite 1 because we need more elements in hits:
hits <- 0:15
my_vec <- dpois(x=hits, lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
sum(my_vec)
# now the sum is correct
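# as a quick check, the remaining shortfall is just the upper-tail
# probability beyond the largest value in hits, which ppois gives directly
ppois(q=15,lambda=2,lower.tail=FALSE)   # P(X > 15) = 1 - sum(my_vec)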
# for a Poisson distribution with lambda=2,
# what is the probability that a single draw will yield X=0?
dpois(x=0,lambda=2)
# "p" function generates cumulative probability density; gives the
# "lower tail" cumulative area of the distribution
hits <- 0:10
my_vec <- ppois(q=hits,lambda=2)
my_pdf(x=hits,y=my_vec,data_type="disc")
# for a Poisson distribution with lambda=2,
# what is the probability of getting 1 or fewer hits?
ppois(q=1, lambda=2)
# We could also get this through dpois
p_0 <- dpois(x=0,lambda=2)
p_0
p_1 <- dpois(x=1,lambda=2)
p_1
p_0 + p_1
# The q function is the inverse of p
# What is the number of hits corresponding to 50% of the probability mass
qpois(p=0.5,lambda=2.5)
my_pdf(x=0:10,y=dpois(x=0:10,lambda=2.5),data_type="disc")
# but distribution is discrete, so this is not exact
ppois(q=2,lambda=2.5)
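# a small check of the p/q round trip for a discrete distribution:
# qpois returns the smallest count whose cumulative probability reaches p,
# so the recovered probability is >= 0.5 rather than exactly 0.5
ppois(q=qpois(p=0.5,lambda=2.5),lambda=2.5)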
# finally, we can simulate individual values from a poisson
ran_pois <- rpois(n=1000,lambda=2.5)
my_histo(x=ran_pois,data_type="disc")
# for real or simulated data, we can use the quantile
# function to find the empirical 95% confidence interval on the data
quantile(x=ran_pois,probs=c(0.025,0.975))
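# for comparison, the theoretical 95% interval from the quantile function;
# with 1000 simulated values the two should be close
qpois(p=c(0.025,0.975),lambda=2.5)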
#-------------------------------------------------
# Binomial distribution
# p = probability of a dichotomous outcome
# size = number of trials
# x = possible outcomes
# outcome x is bounded between 0 and number of trials
# use "d" binom for density function
hits <- 0:10
my_vec <- dbinom(x=hits,size=10,prob=0.5)
my_pdf(x=0:10,y=my_vec,data_type="disc")
# and how does this compare to an actual simulation: 50 replicates of tossing 100 coins?
my_coins <- rbinom(n=50,size=100,prob=0.5)
my_histo(x=my_coins,data_type="disc")
quantile(x=my_coins,probs=c(0.025,0.975))
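# and the corresponding theoretical interval from the binomial quantile function
qbinom(p=c(0.025,0.975),size=100,prob=0.5)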
#-------------------------------------------------
# negative binomial: the number of failures (values of my_vec)
# in a series of Bernoulli trials, with p = probability of success,
# before a target number of successes (= size) is reached
# generates a discrete distribution that is more
# heterogeneous ("overdispersed") than the Poisson
hits <- 0:40
my_vec <- dnbinom(x=hits, size=5, prob=0.5)
my_pdf(x=hits,y=my_vec,data_type="disc")
# the geometric distribution is the special case with size = 1 success
# each bar is a constant fraction (1 - prob) of the bar before it
my_vec <- dnbinom(x=hits, size=1, prob=0.1)
my_pdf(x=hits,y=my_vec,data_type="disc")
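# a quick check of that constant ratio: consecutive probabilities should
# all be (1 - prob) = 0.9 times the bar before them
my_vec[2:5]/my_vec[1:4]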
# alternatively, specify the mean (mu) of the distribution and size,
# the dispersion parameter (small size = more dispersed)
# this gives a Poisson whose lambda value itself varies;
# the dispersion parameter is the shape parameter of the underlying gamma
# as it increases, the distribution has a smaller variance
# just simulate it directly
nbi_ran <- rnbinom(n=1000,size=10,mu=5)
my_histo(x=nbi_ran,data_type="disc")
nbi_ran <- rnbinom(n=1000,size=0.1,mu=5)
my_histo(x=nbi_ran,data_type="disc")
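# a rough check against the expected moments of this parameterization
# (mean = mu, variance = mu + mu^2/size); simulated values are noisy
mean(nbi_ran)   # should be near mu = 5
var(nbi_ran)    # should be in the rough vicinity of 5 + 5^2/0.1 = 255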
#-------------------------------------------------
# normal distribution
my_norm <- rnorm(n=100,mean=100,sd=2)
my_histo(x=my_norm,data_type="cont")
# problems arise with the normal when the mean is small but negative values are not allowed
my_norm <- rnorm(n=100,mean=2,sd=2)
my_histo(x=my_norm,data_type="cont")
summary(my_norm)
# discard the values that fall at or below zero
toss_zeroes <- my_norm[my_norm>0]
my_histo(x=toss_zeroes,data_type="cont")
summary(toss_zeroes)
#-------------------------------------------------
# gamma distribution, continuous positive values, but bounded at 0
my_gamma <- rgamma(n=100,shape=1,scale=10)
my_histo(x=my_gamma,data_type="cont")
# gamma with shape= 1 is an exponential with scale = mean
# shape <=1 gives a mode near zero; very small shape rounds to zero
my_gamma <- rgamma(n=100,shape=0.1,scale=1)
my_histo(x=my_gamma,data_type="cont")
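# a quick check of the exponential equivalence noted above; x_check is
# just a small set of arbitrary test values
x_check <- seq(from=0,to=50,length.out=5)
all.equal(dgamma(x_check,shape=1,scale=10),dexp(x_check,rate=1/10))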
# a large shape parameter moves the distribution towards a normal
my_gamma <- rgamma(n=100,shape=20,scale=1)
my_histo(x=my_gamma,data_type="cont")
# the scale parameter changes the mean, and the variance!
my_histo(x=rgamma(n=100,shape=2,scale=100),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=10),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=1),data_type="cont")
my_histo(x=rgamma(n=100,shape=2,scale=0.1),data_type="cont")
# unlike the normal, the two parameters affect both mean and variance
# mean = shape*scale
# variance= shape*scale^2
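# a rough simulation check of those formulas (approximate, since the
# values are random draws); check_gamma is just a throwaway sample
check_gamma <- rgamma(n=10000,shape=2,scale=10)
mean(check_gamma)   # expected: 2*10 = 20
var(check_gamma)    # expected: 2*10^2 = 200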
#-------------------------------------------------
# beta distribution
# bounded at 0 and 1
# analogous to a binomial, but the result is a continuous distribution of probabilities
# parameter shape1 = number of successes + 1
# parameter shape2 = number of failures + 1
# interpret these in terms of a coin you are tossing
# shape1 = 1, shape2 = 1 = "no data"
my_beta <- rbeta(n=1000,shape1=1,shape2=1)
my_histo(x=my_beta,data_type="cont")
# shape1 = 2, shape2 = 1 = "1 coin toss, comes up heads!"
my_beta <- rbeta(n=1000,shape1=2,shape2=1)
my_histo(x=my_beta,data_type="cont")
# two tosses, 1 head and 1 tail
my_beta <- rbeta(n=1000,shape1=2,shape2=2)
my_histo(x=my_beta,data_type="cont")
# two tosses, both heads
my_beta <- rbeta(n=1000,shape1=3,shape2=1)
my_histo(x=my_beta,data_type="cont")
# let's get more data
my_beta <- rbeta(n=1000,shape1=20,shape2=20)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=500,shape2=500)
my_histo(x=my_beta,data_type="cont")
# if the coin is biased
my_beta <- rbeta(n=1000,shape1=1000,shape2=500)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=10,shape2=5)
my_histo(x=my_beta,data_type="cont")
# shape parameters less than 1.0 give us a u-shaped distribution
my_beta <- rbeta(n=1000,shape1=0.1,shape2=0.1)
my_histo(x=my_beta,data_type="cont")
my_beta <- rbeta(n=1000,shape1=0.5,shape2=0.2)
my_histo(x=my_beta,data_type="cont")
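# a rough check of the beta mean, which is shape1/(shape1 + shape2);
# for the last simulation that is 0.5/(0.5 + 0.2), or about 0.71
mean(my_beta)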
p(data|hypothesis) is the standard approach for null hypothesis tests.
Model parameters can be thought of as a hypothesis about the distribution of the data: p(data|parameters). This would be a goodness-of-fit test of the data to a particular statistical distribution.
More useful might be p(parameters|data). For a given set of data, what is the probability that a particular statistical distribution (such as a normal with a specified mean and standard deviation) provides a fit to these data?
Clearly, we want the parameters that maximize this probability. The parameters that fit this definition are the maximum likelihood estimators.
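Before reaching for a canned function, here is a minimal sketch of the idea with some made-up numbers: the log-likelihood of the data is the sum of the log densities, and for a normal with a fixed standard deviation it is largest when the mean is set to the sample mean.
toy <- c(4.1,5.2,4.8,5.5,4.9)                       # made-up values
sum(dnorm(toy,mean=mean(toy),sd=sd(toy),log=TRUE))  # log-likelihood at the sample mean
sum(dnorm(toy,mean=3,sd=sd(toy),log=TRUE))          # smaller: a worse candidate mean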
Here is a simple example. For this procedure, we will use the function fitdistr (from the MASS package loaded above) to find the maximum likelihood estimators of the parameters. Here are a few data from Lauren Ash's dissertation: snout-vent length (SVL) measurements for 6 specimens of green frogs (Lithobates clamitans) collected from Berlin Pond in 2016.
frog_data <- c(30.15,26.3,27.5,22.9,27.8,26.2)
# get and print model parameters assuming a normal distribution
z<- fitdistr(frog_data,"normal")
print(z)
# plot the density function for the normal and annotate the plot with the original data
x <- 1:100
g_density <- dnorm(x=x,mean=z$estimate["mean"],sd=z$estimate["sd"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001)
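# a side check (not part of fitdistr's output): for the normal, the maximum
# likelihood estimates are the sample mean and the sd computed with a
# divisor of n rather than n - 1, so they can be recovered by hand
mean(frog_data)
sqrt(sum((frog_data - mean(frog_data))^2)/length(frog_data))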
# now do the same for a gamma distribution, which has a shape and rate parameter as outputs from fitdistr
z<- fitdistr(frog_data,"gamma")
print(z)
# plot the density function for the gamma and annotate the plot with the original data
x <- 1:100
g_density <- dgamma(x=x,shape=z$estimate["shape"],rate=z$estimate["rate"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001,color="red")
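# the fitted objects also carry the maximized log-likelihood (the loglik
# component), so the two fits can be compared directly on the same data
fitdistr(frog_data,"normal")$loglik
fitdistr(frog_data,"gamma")$loglik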
There is not much difference here. But now let's add a weird outlier, a tiny value, and see what happens.
frog_data <- c(30.15,26.3,27.5,22.9,27.8,26.2)
outlier <- 0.050
frog_data <- c(frog_data,outlier)
# get and print model parameters assuming a normal distribution
z<- fitdistr(frog_data,"normal")
print(z)
# plot the density function for the normal and annotate the plot with the original data
x <- 1:100
g_density <- dnorm(x=x,mean=z$estimate["mean"],sd=z$estimate["sd"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001)
# now do the same for a gamma distribution, which has a shape and rate parameter as outputs from fitdistr
z<- fitdistr(frog_data,"gamma")
print(z)
# plot the density function for the gamma and annotate the plot with the original data
x <- 1:100
g_density <- dgamma(x=x,shape=z$estimate["shape"],rate=z$estimate["rate"])
qplot(x,g_density,geom="line") + annotate(geom="point",x=frog_data,y=0.001,color="red")