In this post we are going to take a look at various data importing techniques used for spatial data analysis.

Importing Data from Tables (read.table)

Accessing and importing open access environmental data is a crucial skill for data scientists. This section teaches you how to download data from the Web, import it in R and check it for consistency.

In this section, we are going to take a look at…

#Set the URL pointing to the USGS all-day earthquake CSV feed
URL <- "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_day.csv"
#Load the CSV file; read.table can read directly from a URL.
#na.strings (spelled out in full — the original "na.string" only worked
#via partial argument matching) treats empty fields as NA.
Data <- read.table(file = URL,
                   sep = ",",
                   header = TRUE,
                   na.strings = "")
#Help function
help(read.table)
#Examine the structure of the imported data.frame
str(Data)
## 'data.frame':    170 obs. of  22 variables:
##  $ time           : Factor w/ 170 levels "2017-12-11T05:56:53.082Z",..: 170 169 168 167 166 165 164 163 162 161 ...
##  $ latitude       : num  33.1 64.3 59.6 37.6 32.4 ...
##  $ longitude      : num  -116 -152 -153 -119 -116 ...
##  $ depth          : num  10.95 1.4 86.5 5.97 16.22 ...
##  $ mag            : num  0.88 1.6 2.3 1.62 1.25 2.5 0.46 0.9 2.67 0.37 ...
##  $ magType        : Factor w/ 5 levels "mb","mb_lg","md",..: 4 4 4 3 4 4 4 3 3 4 ...
##  $ nst            : int  30 NA NA 26 12 NA 20 13 10 11 ...
##  $ gap            : num  64 NA NA 127 159 NA 57 195 94 93 ...
##  $ dmin           : num  0.1248 NA NA 0.0306 0.282 ...
##  $ rms            : num  0.23 0.26 0.44 0.04 0.12 0.59 0.26 0.03 0.16 0.05 ...
##  $ net            : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...
##  $ id             : Factor w/ 170 levels "ak17579412","ak17582323",..: 75 20 19 118 74 18 73 117 148 72 ...
##  $ updated        : Factor w/ 170 levels "2017-12-11T06:48:02.016Z",..: 168 167 165 166 163 162 160 169 164 157 ...
##  $ place          : Factor w/ 144 levels "105km NW of Unalakleet, Alaska",..: 34 137 102 29 77 120 139 144 124 117 ...
##  $ type           : Factor w/ 3 levels "earthquake","explosion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ horizontalError: num  0.38 NA NA 0.32 0.35 NA 0.39 0.38 0.44 0.17 ...
##  $ depthError     : num  0.55 0.6 0.4 0.65 1.79 ...
##  $ magError       : num  0.106 NA NA 0.15 0.248 NA 0.337 0.04 0.48 0.214 ...
##  $ magNst         : int  20 NA NA 21 13 NA 14 5 8 7 ...
##  $ status         : Factor w/ 2 levels "automatic","reviewed": 1 1 1 1 1 1 1 1 2 1 ...
##  $ locationSource : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...
##  $ magSource      : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...

Downloading Open Data from FTP Sites

Oftentimes, datasets are provided for free on FTP websites, and practitioners need to be able to access them. R is perfectly capable of downloading and importing data from FTP sites.

In this section, we are going to take a look at…

#Load required packages
library(RCurl)
## Loading required package: bitops
library(XML)
#Retrieve the names of all the files on the FTP site.
#Named FileNames (not "list") to avoid shadowing base::list.
FileNames <- getURL("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/", 
                    dirlistonly = TRUE) 
#Clean the listing: split the single returned string on CRLF line endings
FileList <- strsplit(FileNames, split="\r\n")
#Create a new directory where to download these files
#(guarded with dir.exists so re-running the script does not warn)
DIR <- file.path(getwd(), "NOAAFiles")
if (!dir.exists(DIR)) dir.create(DIR)
## Warning in dir.create(DIR): 'E:\Projects\sumendar.github.io\content\post
## \NOAAFiles' already exists
#Loop to download the files one by one.
#mode="wb" (binary) is required on Windows for non-text files.
for(FileName in unlist(FileList)){
  URL <- paste0("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/", FileName)
  download.file(URL, destfile = file.path(DIR, FileName), method = "auto",
                mode = "wb")
}
#A more elegant way: wrap the download in a function and lapply over
#the file names (here limited to the first 5 for demonstration)
DownloadFile <- function(x){
  URL <- paste0("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/", x)
  download.file(URL, destfile = file.path(DIR, x), method = "auto", mode = "wb")
}
lapply(unlist(FileList)[1:5], DownloadFile)
#Download a compressed file
URL <- "ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2015/gsod_2015.tar"
download.file(URL, destfile = file.path(DIR, "gsod_2015.tar"),
              method = "auto", mode = "wb")

#Extract the tar archive into the same directory
untar(file.path(getwd(), "NOAAFiles", "gsod_2015.tar"), 
      exdir = file.path(getwd(), "NOAAFiles"))
#untar is the function used above; unzip is its analogue for .zip files
help(untar)
#For more information on the full experiment please visit:
#http://r-video-tutorial.blogspot.ch/2014/12/accessing-cleaning-and-plotting-noaa.html

Importing with readLines (The Last Resort)

Some data cannot be opened with either read.table or read.fwf.
In these desperate cases, readLines can help.
In this section, we are going to take a look at…

#Download the data from the FTP site
URL <- "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/2015/010231-99999-2015.gz"
FileName <- "010231-99999-2015.gz"
download.file(URL, destfile=file.path(getwd(), FileName), method="auto", mode="wb")
#Read the gzipped file line by line through a connection.
#Leaving the connection unopened lets readLines manage it (avoiding the
#"seek on a gzfile connection" warning caused by open="rt"), and the
#explicit close() prevents a leaked connection.
con <- gzfile(FileName)
data.strings <- readLines(con)
close(con)
## Warning in readLines(gzfile(FileName, open = "rt")): seek on a gzfile
## connection returned an internal error
head(data.strings)
## [1] "0071010231999992015010100204+64350+007800FM-15+000099999V0202201N021119999999N999999999+00801+00701999999ADDMA1100401999999REMMET044METAR ENDR 010020Z AUTO 22041KT 08/07 Q1004="
## [2] "0071010231999992015010100504+64350+007800FM-15+000099999V0202201N020619999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010050Z AUTO 22040KT 08/07 Q1003="
## [3] "0071010231999992015010101204+64350+007800FM-15+000099999V0202201N020619999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010120Z AUTO 22040KT 08/07 Q1003="
## [4] "0071010231999992015010101504+64350+007800FM-15+000099999V0202201N019019999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010150Z AUTO 22037KT 08/07 Q1003="
## [5] "0071010231999992015010102204+64350+007800FM-15+000099999V0202201N017519999999N999999999+00801+00801999999ADDMA1100301999999REMMET044METAR ENDR 010220Z AUTO 22034KT 08/08 Q1003="
## [6] "0071010231999992015010102504+64350+007800FM-15+000099999V0202201N017519999999N999999999+00801+00701999999ADDMA1100201999999REMMET044METAR ENDR 010250Z AUTO 22034KT 08/07 Q1002="

Functions for test

#Extract the latitude field (fixed-width character positions 29-34)
#from a raw ISD record string. Returns the field as a character string.
Ext.Latitude <- function(x){
  lat_start <- 29
  lat_stop <- 34
  substr(x, lat_start, lat_stop)
}

Functions for test 2

#Extract the longitude field (fixed-width character positions 35-41)
#from a raw ISD record string. Returns the field as a character string.
Ext.Longitude <- function(x){
  lon_start <- 35
  lon_stop <- 41
  substr(x, lon_start, lon_stop)
}

Functions for test 2

#Extract the air-temperature field (fixed-width character positions
#88-92) from a raw ISD record string. Returns the field as a character
#string.
Ext.Temp <- function(x){
  temp_start <- 88
  temp_stop <- 92
  substr(x, temp_start, temp_stop)
}

lapply function usage

#Apply each fixed-width extractor to every raw record; each result is a
#list of one string per record.
extract_field <- function(extractor) lapply(data.strings, extractor)
LAT  <- extract_field(Ext.Latitude)
LON  <- extract_field(Ext.Longitude)
TEMP <- extract_field(Ext.Temp)

Create a data.frame we can use for data analysis

#Assemble the extracted fields into a data.frame, converting the
#fixed-width strings to numbers. Coordinates are divided by 1000 and
#temperatures by 10 to recover decimal degrees / degrees.
to_coord <- function(v) as.numeric(unlist(v)) / 1000
DATA <- data.frame(Latitude    = to_coord(LAT),
                   Longitude   = to_coord(LON),
                   Temperature = as.numeric(unlist(TEMP)) / 10)

Final cleanup: recode the missing-value sentinel as NA and inspect the result

#999.9 is the sentinel for a missing temperature; recode it as NA.
#which() drops NA positions from the subscript — the original
#logical-index form (DATA$Temperature==999.9) would raise
#"missing values are not allowed in subscripted assignments" if any
#Temperature value is already NA.
DATA[which(DATA$Temperature == 999.9), "Temperature"] <- NA

str(DATA)
## 'data.frame':    17291 obs. of  3 variables:
##  $ Latitude   : num  64.3 64.3 64.3 64.3 64.3 ...
##  $ Longitude  : num  7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 ...
##  $ Temperature: num  8 8 8 8 8 8 8 8 8 8 ...
hist(DATA$Temperature, main="Temperature")