#SQL stands for Structured Query Language, and it’s useful for storing and processing datasets. This is often used in a lot of other applications, for example I use it a lot when running GIS applications like ArcGIS Pro or ArcMaps, which usually doesn’t have a dedicated way to interact with datasets except with SQL.

#We’re going to be using the site_by_species.csv and site_by_variables.csv. The first has every row being a site, and the presence/absence of each species as a separate column. The second has every row being a site, and the columns being various environmental conditions/predictors.

#Installing Packages
library(sqldf)

## Warning: package 'sqldf' was built under R version 4.4.2

## Warning: package 'gsubfn' was built under R version 4.4.2

## Warning: package 'proto' was built under R version 4.4.2

## Warning: package 'RSQLite' was built under R version 4.4.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.1

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.1

## Warning: package 'ggplot2' was built under R version 4.4.1

## Warning: package 'tibble' was built under R version 4.4.1

## Warning: package 'tidyr' was built under R version 4.4.1

## Warning: package 'readr' was built under R version 4.4.1

## Warning: package 'purrr' was built under R version 4.4.1

## Warning: package 'stringr' was built under R version 4.4.1

## Warning: package 'forcats' was built under R version 4.4.1

## Warning: package 'lubridate' was built under R version 4.4.1

Take a look at the datasets first

head(species_clean)

head(var_clean)

Start with operations using 1 file

How to subset rows

dplyr Method:

#Dplyr method-uses the filter() method
species<-filter(species_clean, Site<30)
var<-filter(var_clean, Site<30)

SQL Method

#SQL Method
query="
SELECT Site, Sp1, Sp2, Sp3 
FROM species
WHERE Site<'30'
"
sqldf(query)

##    Site Sp1 Sp2 Sp3
## 1     1   0   0   0
## 2     2   0   0   0
## 3     3   0   0   0
## 4     4   0   0   0
## 5     5   0   0   0
## 6     6   0   0   0
## 7     8   0   0   0
## 8     9   0   0   0
## 9    10   0   0   0
## 10   11   0   0   0
## 11   12   0   0   0
## 12   13   0   0   0
## 13   15   0   1   0
## 14   16   0   0   0
## 15   17   0   0   0
## 16   18   0   0   0
## 17   19   0   0   0
## 18   20   0   0   0
## 19   21   0   0   0
## 20   23   0   0   0
## 21   24   0   0   0
## 22   25   0   0   0
## 23   26   0   0   0
## 24   28   0   0   0

How to subset columns

Dplyr method-uses select() function, which can use either column # or name

dplyr method

edit_species<-species%>%
  select(Site, Sp1, Sp2, Sp3)
edit_species_2<-species%>% #Using column #s doesn't require any indicators
  select(1, 2, 3, 4)

SQL Method

#SQL method-
# query entire table
query ="
SELECT *
FROM species
"
a=sqldf(query) # save results to data frame

sqldf(query) # dump to console

Specify columns

query ="
SELECT Site, Sp1, Sp2, Sp3
FROM species
"
sqldf(query)

##    Site Sp1 Sp2 Sp3
## 1     1   0   0   0
## 2     2   0   0   0
## 3     3   0   0   0
## 4     4   0   0   0
## 5     5   0   0   0
## 6     6   0   0   0
## 7     8   0   0   0
## 8     9   0   0   0
## 9    10   0   0   0
## 10   11   0   0   0
## 11   12   0   0   0
## 12   13   0   0   0
## 13   15   0   1   0
## 14   16   0   0   0
## 15   17   0   0   0
## 16   18   0   0   0
## 17   19   0   0   0
## 18   20   0   0   0
## 19   21   0   0   0
## 20   23   0   0   0
## 21   24   0   0   0
## 22   25   0   0   0
## 23   26   0   0   0
## 24   28   0   0   0

# reorder columns
query ="
SELECT Sp1, Sp2, Sp3, Site
FROM species
"
sqldf(query)

##    Sp1 Sp2 Sp3 Site
## 1    0   0   0    1
## 2    0   0   0    2
## 3    0   0   0    3
## 4    0   0   0    4
## 5    0   0   0    5
## 6    0   0   0    6
## 7    0   0   0    8
## 8    0   0   0    9
## 9    0   0   0   10
## 10   0   0   0   11
## 11   0   0   0   12
## 12   0   0   0   13
## 13   0   1   0   15
## 14   0   0   0   16
## 15   0   0   0   17
## 16   0   0   0   18
## 17   0   0   0   19
## 18   0   0   0   20
## 19   0   0   0   21
## 20   0   0   0   23
## 21   0   0   0   24
## 22   0   0   0   25
## 23   0   0   0   26
## 24   0   0   0   28

If we want to rename columns, we can just use the rename() function

Dplyr method

species<-rename(species, Long=Longitude.x., Lat=Latitude.y.)

Pull out all the numerical columns

dplyr method

num_species<-species%>%
  mutate(letters=rep(letters, length.out=length(species$Site)))
num_species<-select(num_species, Site, Long, Lat, Sp1,letters)
num_species_edit<-select(num_species, where(is.numeric))

Pivot_Longer

Pivot_longer lengthens the data, decreasing the number of columns, and increases the number of rows. Can use either pivot_longer or gather, but gather is outdated

species_long<-pivot_longer(edit_species, cols=c(Sp1, Sp2, Sp3), names_to="ID")

Pivot_wide

Pivot_wide goes from long to wide, widens the data, increasing the number of columns, decreases the number of rows. Can use pivot_wider or spread

species_wide<-pivot_wider(species_long, names_from=ID)

Aggregating SQL Method

SQL Method

aggregate and give counts of ObjectType

query="
SELECT SUM(Sp1+Sp2+Sp3)
FROM species_wide
GROUP BY SITE
"
sqldf(query)

##    SUM(Sp1+Sp2+Sp3)
## 1                 0
## 2                 0
## 3                 0
## 4                 0
## 5                 0
## 6                 0
## 7                 0
## 8                 0
## 9                 0
## 10                0
## 11                0
## 12                0
## 13                1
## 14                0
## 15                0
## 16                0
## 17                0
## 18                0
## 19                0
## 20                0
## 21                0
## 22                0
## 23                0
## 24                0

Aggregate and give counts with new variable name

query="
SELECT SUM(Sp1+Sp2+Sp3) AS Occurence
FROM species_wide
GROUP BY SITE
"
sqldf(query)

##    Occurence
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0
## 7          0
## 8          0
## 9          0
## 10         0
## 11         0
## 12         0
## 13         1
## 14         0
## 15         0
## 16         0
## 17         0
## 18         0
## 19         0
## 20         0
## 21         0
## 22         0
## 23         0
## 24         0

Mutating/Adding columns Dplyr method-uses mutate function()-we’ve already covered this!

SQL Method

#SQL method
query="
ALTER TABLE species_wide
ADD new_column VARCHAR
"
sqldf(query)

## Warning in result_fetch(res@ptr, n = n): `dbGetQuery()`, `dbSendQuery()` and
## `dbFetch()` should only be used with `SELECT` queries. Did you mean
## `dbExecute()`, `dbSendStatement()` or `dbGetRowsAffected()`?

## data frame with 0 columns and 0 rows

Re-ordering columns

Dplyr method-using the select() method

reorder<-select(species, Sp1, Sp2, Site)

SQL Method reorder columns

query ="
SELECT Sp1, Sp2, Sp3, Site
FROM species
"
sqldf(query)

##    Sp1 Sp2 Sp3 Site
## 1    0   0   0    1
## 2    0   0   0    2
## 3    0   0   0    3
## 4    0   0   0    4
## 5    0   0   0    5
## 6    0   0   0    6
## 7    0   0   0    8
## 8    0   0   0    9
## 9    0   0   0   10
## 10   0   0   0   11
## 11   0   0   0   12
## 12   0   0   0   13
## 13   0   1   0   15
## 14   0   0   0   16
## 15   0   0   0   17
## 16   0   0   0   18
## 17   0   0   0   19
## 18   0   0   0   20
## 19   0   0   0   21
## 20   0   0   0   23
## 21   0   0   0   24
## 22   0   0   0   25
## 23   0   0   0   26
## 24   0   0   0   28

2 File Operations

Here, we are going to interact with 2 data files (.csv’s), and try to gather them into a usable form. Often times people will store different variables/types of data into different files, just to maintain organization-sometimes this is good, but others, we may need want to combine them, in which case we “join” them together.

Joins are going to be different from binding-joining requires the data files to have at least one similar column so that it knows how to combine them, and will order it accordingly,while binding just immediately pastes the rows/columns together, and by position, not by anything meaningful. So, use those with that warning-there is a high chance for error when using bind functions, as any minor error in that could throw off the entire dataset.

Left/Right/Union Joins

Let’s first start with clean files-Reset the species and var variables, and let’s filter them to a manageable size.

edit_species<-species_clean%>%
  filter(Site<30)%>%
  select(Site, Sp1, Sp2, Sp3, Sp4, Longitude.x., Latitude.y.)

edit_var<-var_clean%>%
  filter(Site<30)%>%
    select(Site, Longitude.x., Latitude.y., BIO1_Annual_mean_temperature, BIO12_Annual_precipitation)

#Dplyr method
#Left_join-a left join basically means you are stitching the matching rows of file B to file A-this does require that there be some matching/marker column to actually link. 
left<-left_join(edit_species, edit_var, by="Site")
head(left)

##   Site Sp1 Sp2 Sp3 Sp4 Longitude.x..x Latitude.y..x Longitude.x..y
## 1    1   0   0   0   0         29.375       -22.125          29.38
## 2    2   0   0   0   0         30.875       -22.125          30.88
## 3    3   0   0   0   1         29.125       -22.375          29.13
## 4    4   0   0   0   0         30.125       -22.375          30.13
## 5    5   0   0   0   0         30.375       -22.375          30.38
## 6    6   0   0   0   0         30.625       -22.375          30.63
##   Latitude.y..y BIO1_Annual_mean_temperature BIO12_Annual_precipitation
## 1        -22.13                        22.68                     354.00
## 2        -22.13                        23.44                     487.50
## 3        -22.38                        21.91                     350.75
## 4        -22.38                        22.83                     448.00
## 5        -22.38                        23.07                     434.00
## 6        -22.38                        23.12                     466.75

#Right_join-does the opposite-joins the matching rows of file A to B. The difference here is what is lost when you match them together
right<-right_join(edit_species, edit_var, by="Site")
head(right)

##   Site Sp1 Sp2 Sp3 Sp4 Longitude.x..x Latitude.y..x Longitude.x..y
## 1    1   0   0   0   0         29.375       -22.125          29.38
## 2    2   0   0   0   0         30.875       -22.125          30.88
## 3    3   0   0   0   1         29.125       -22.375          29.13
## 4    4   0   0   0   0         30.125       -22.375          30.13
## 5    5   0   0   0   0         30.375       -22.375          30.38
## 6    6   0   0   0   0         30.625       -22.375          30.63
##   Latitude.y..y BIO1_Annual_mean_temperature BIO12_Annual_precipitation
## 1        -22.13                        22.68                     354.00
## 2        -22.13                        23.44                     487.50
## 3        -22.38                        21.91                     350.75
## 4        -22.38                        22.83                     448.00
## 5        -22.38                        23.07                     434.00
## 6        -22.38                        23.12                     466.75

#Inner_join-retains rows that match between both files A and B. This one loses a lot of information if they aren't matching very well.
inner<-inner_join(edit_species, edit_var, by="Site")

#Full_join-the opposite of an inner join, just retains all values, all rows, so instead of losing a lot of rows, you have a trade-off where you instead get a bunch of NA's when they're missing.
full<-full_join(edit_species, edit_var, by="Site")

SQL method

query="
SELECT *
FROM edit_var
INNER JOIN edit_species ON edit_var.Site = edit_species.Site
"
x<-sqldf(query)

# ------------------------------
# left join keeps all rows from the first table and adds
# data from second table for matches and NAs for mismatches (n = 500 rows = row number in Orders table)
query="
SELECT *
FROM edit_var
LEFT JOIN edit_species ON edit_var.Site = edit_species.Site
"
x<-sqldf(query)

dplyr_SQL_Lecture

George Ni

2023-02-27

Take a look at the datasets first

Start with operations using 1 file

How to subset rows

How to subset columns

Dplyr method-uses select() function, which can use either column # or name

Specify columns

Pull out all the numerical columns

Pivot_Longer

Pivot_wide

SQL Method

aggregate and give counts of ObjectType

Re-ordering columns

Dplyr method-using the select() method

2 File Operations

Left/Right/Union Joins