library(survival)
survival <-read.csv(file ="Graveyard_Survival.csv")
attach(survival)
survival$Death_Age <- as.numeric(Death_Year-Birth_Year)
survival$Event<-as.logical(rep("TRUE",length(survival$Death_Age)))
survival$Century<-as.factor(ifelse(survival$Birth_Year<1900,"19th","20th"))

# Plotting survival curves by sex
sfit2<-survfit(Surv(Death_Age, Event)~Sex, data=survival)
summary(sfit2, times=seq(0,100,10))
## Call: survfit(formula = Surv(Death_Age, Event) ~ Sex, data = survival)
## 
##                 Sex=F 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    262       0   1.0000 0.00000      1.00000       1.0000
##    10    233      34   0.8702 0.02076      0.83047       0.9119
##    20    208      23   0.7824 0.02549      0.73405       0.8340
##    30    177      31   0.6641 0.02918      0.60933       0.7238
##    40    155      20   0.5878 0.03041      0.53111       0.6505
##    50    134      25   0.4924 0.03089      0.43540       0.5568
##    60    105      26   0.3931 0.03018      0.33822       0.4570
##    70     78      26   0.2939 0.02814      0.24360       0.3546
##    80     45      32   0.1718 0.02330      0.13165       0.2241
##    90      7      40   0.0191 0.00845      0.00801       0.0455
## 
##                 Sex=M 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    288       0  1.00000 0.00000      1.00000       1.0000
##    10    257      33  0.88542 0.01877      0.84938       0.9230
##    20    230      28  0.78819 0.02408      0.74239       0.8368
##    30    200      31  0.68056 0.02747      0.62878       0.7366
##    40    157      44  0.52778 0.02942      0.47316       0.5887
##    50    128      26  0.43750 0.02923      0.38380       0.4987
##    60    101      29  0.33681 0.02785      0.28642       0.3961
##    70     53      46  0.17708 0.02249      0.13806       0.2271
##    80     21      32  0.06597 0.01463      0.04272       0.1019
##    90      3      17  0.00694 0.00489      0.00175       0.0276
plot(sfit2) # pretty bad

#devtools::install_github("kassambara/survminer", build_vignettes = FALSE)
#devtools::install_github("kassambara/ggpubr")
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
library(survminer)

# ggplot survival curves: not as curvy as how we calculated
# ggsurvplot(sfit2)
ggsurvplot(sfit2, conf.int=TRUE, pval=TRUE, risk.table=FALSE, legend.title="Sex",
           legend.labs=c("Female", "Male"),
           palette=c("orchid2","dodgerblue2"), 
           main="Kaplan-Meier Curve for Survival between sexes")

# Plotting survival curves by century
sfit3<-survfit(Surv(Death_Age, Event)~Century, data=survival)
summary(sfit3, times=seq(0,100,10))
## Call: survfit(formula = Surv(Death_Age, Event) ~ Century, data = survival)
## 
##                 Century=19th 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    293       0  1.00000 0.00000      1.00000       1.0000
##    10    258      41  0.86007 0.02027      0.82125       0.9007
##    20    224      32  0.75085 0.02527      0.70293       0.8020
##    30    177      49  0.58362 0.02880      0.52982       0.6429
##    40    135      38  0.45392 0.02909      0.40035       0.5147
##    50    113      23  0.37543 0.02829      0.32388       0.4352
##    60     92      22  0.30034 0.02678      0.25218       0.3577
##    70     59      31  0.19454 0.02313      0.15411       0.2456
##    80     28      29  0.09556 0.01718      0.06719       0.1359
##    90      4      26  0.00683 0.00481      0.00172       0.0272
## 
##                 Century=20th 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    257       0   1.0000 0.00000      1.00000       1.0000
##    10    232      26   0.8988 0.01881      0.86271       0.9365
##    20    214      19   0.8249 0.02371      0.77972       0.8727
##    30    200      13   0.7743 0.02608      0.72486       0.8272
##    40    177      26   0.6732 0.02926      0.61818       0.7330
##    50    149      28   0.5642 0.03093      0.50672       0.6282
##    60    114      33   0.4358 0.03093      0.37920       0.5008
##    70     72      41   0.2763 0.02789      0.22667       0.3367
##    80     38      35   0.1401 0.02165      0.10347       0.1896
##    90      6      31   0.0195 0.00862      0.00817       0.0463
plot(sfit3) # pretty crappy

ggsurvplot(sfit3, conf.int=TRUE, pval=TRUE, risk.table=FALSE, legend.title="Century",
           legend.labs=c("19th Century", "20th Century"),
           palette=c("darkslategrey","blue"), 
           main="Kaplan-Meier Curve for Survival between centuries")

sfit4<-survfit(Surv(Death_Age, Event)~Century+Sex, data=survival)
ggsurvplot(sfit4)

## Cox PH test: Cox regression (or proportional hazards regression) allows analyzing the effect of several risk factors on survival

# Hazard Ratio (HR) is exp(coef)
#HR=1: No effect
#HR>1: Increase in hazard
#HR<1: Reduction in hazard (protective)
diff_Sex <- coxph(Surv(Death_Age, Event)~Sex, data=survival)
diff_Sex
## Call:
## coxph(formula = Surv(Death_Age, Event) ~ Sex, data = survival)
## 
##        coef exp(coef) se(coef)    z      p
## SexM 0.2562    1.2921   0.0867 2.96 0.0031
## 
## Likelihood ratio test=8.76  on 1 df, p=0.00308
## n= 550, number of events= 550
## since exp(coef)>1, males have increase in hazard

diff_Century <- coxph(Surv(Death_Age, Event)~Century, data=survival)
diff_Century
## Call:
## coxph(formula = Surv(Death_Age, Event) ~ Century, data = survival)
## 
##               coef exp(coef) se(coef)     z       p
## Century20th -0.323     0.724    0.086 -3.75 0.00017
## 
## Likelihood ratio test=14.1  on 1 df, p=0.00017
## n= 550, number of events= 550
## since exp(coef)<1, 20th century have reduction in hazard (hazard = probablity of death)

diff_Sex_Century <- coxph(Surv(Death_Age, Event)~Sex+Century, data=survival)
diff_Sex_Century
## Call:
## coxph(formula = Surv(Death_Age, Event) ~ Sex + Century, data = survival)
## 
##                coef exp(coef) se(coef)     z       p
## SexM         0.2364    1.2667   0.0867  2.73 0.00640
## Century20th -0.3081    0.7348   0.0862 -3.58 0.00035
## 
## Likelihood ratio test=21.6  on 2 df, p=2.05e-05
## n= 550, number of events= 550
## both are significant

## logrank test: log-rank test is useful for comparing survival curves in two or more groups 
#using survdiff in survival package
survdiff(Surv(Death_Age, Event)~Sex, data=survival)
## Call:
## survdiff(formula = Surv(Death_Age, Event) ~ Sex, data = survival)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## Sex=F 262      262      296      3.83      8.87
## Sex=M 288      288      254      4.45      8.87
## 
##  Chisq= 8.9  on 1 degrees of freedom, p= 0.0029
survdiff(Surv(Death_Age, Event)~Century, data=survival)
## Call:
## survdiff(formula = Surv(Death_Age, Event) ~ Century, data = survival)
## 
##                N Observed Expected (O-E)^2/E (O-E)^2/V
## Century=19th 293      293      250      7.50      14.5
## Century=20th 257      257      300      6.24      14.5
## 
##  Chisq= 14.5  on 1 degrees of freedom, p= 0.000138
survdiff(Surv(Death_Age, Event)~Century+Sex, data=survival)
## Call:
## survdiff(formula = Surv(Death_Age, Event) ~ Century + Sex, data = survival)
## 
##                       N Observed Expected (O-E)^2/E (O-E)^2/V
## Century=19th, Sex=F 140      140      125     1.883     2.556
## Century=19th, Sex=M 153      153      125     6.255     8.510
## Century=20th, Sex=F 122      122      171    14.017    22.142
## Century=20th, Sex=M 135      135      129     0.248     0.343
## 
##  Chisq= 24.3  on 3 degrees of freedom, p= 2.19e-05