In this post, we are going to take a look at various data importing techniques used for spatial data analysis.
Accessing and importing open access environmental data is a crucial skill for data scientists. This section teaches you how to download data from the Web, import it into R and check it for consistency.
In this section, we are going to take a look at how to read a CSV file straight from a URL, using the daily earthquake summary published by the USGS.
#Set the URL of the CSV file
URL <- "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_day.csv"
#Load the CSV File
Data <- read.table(file=URL,
                   sep=",",
                   header=TRUE,
                   na.strings="")
#Consult the help page of read.table
help(read.table)
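#As an aside, read.csv() is a thin wrapper around read.table() with sep=","
#and header=TRUE already set, so the same import could also be written as:
Data2 <- read.csv(URL, na.strings="")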
#Examining the data
str(Data)
## 'data.frame': 170 obs. of 22 variables:
## $ time : Factor w/ 170 levels "2017-12-11T05:56:53.082Z",..: 170 169 168 167 166 165 164 163 162 161 ...
## $ latitude : num 33.1 64.3 59.6 37.6 32.4 ...
## $ longitude : num -116 -152 -153 -119 -116 ...
## $ depth : num 10.95 1.4 86.5 5.97 16.22 ...
## $ mag : num 0.88 1.6 2.3 1.62 1.25 2.5 0.46 0.9 2.67 0.37 ...
## $ magType : Factor w/ 5 levels "mb","mb_lg","md",..: 4 4 4 3 4 4 4 3 3 4 ...
## $ nst : int 30 NA NA 26 12 NA 20 13 10 11 ...
## $ gap : num 64 NA NA 127 159 NA 57 195 94 93 ...
## $ dmin : num 0.1248 NA NA 0.0306 0.282 ...
## $ rms : num 0.23 0.26 0.44 0.04 0.12 0.59 0.26 0.03 0.16 0.05 ...
## $ net : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...
## $ id : Factor w/ 170 levels "ak17579412","ak17582323",..: 75 20 19 118 74 18 73 117 148 72 ...
## $ updated : Factor w/ 170 levels "2017-12-11T06:48:02.016Z",..: 168 167 165 166 163 162 160 169 164 157 ...
## $ place : Factor w/ 144 levels "105km NW of Unalakleet, Alaska",..: 34 137 102 29 77 120 139 144 124 117 ...
## $ type : Factor w/ 3 levels "earthquake","explosion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ horizontalError: num 0.38 NA NA 0.32 0.35 NA 0.39 0.38 0.44 0.17 ...
## $ depthError : num 0.55 0.6 0.4 0.65 1.79 ...
## $ magError : num 0.106 NA NA 0.15 0.248 NA 0.337 0.04 0.48 0.214 ...
## $ magNst : int 20 NA NA 21 13 NA 14 5 8 7 ...
## $ status : Factor w/ 2 levels "automatic","reviewed": 1 1 1 1 1 1 1 1 2 1 ...
## $ locationSource : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...
## $ magSource : Factor w/ 10 levels "ak","ci","hv",..: 2 1 1 5 2 1 2 5 7 2 ...
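Notice that the time column is imported as a factor. As a quick consistency check, a minimal sketch converts it to a proper date-time class and summarises the event times and magnitudes:
#Convert the time stamps to POSIXct (UTC) and summarise times and magnitudes
Data$time <- as.POSIXct(as.character(Data$time),
                        format="%Y-%m-%dT%H:%M:%OS", tz="UTC")
summary(Data$time)
summary(Data$mag)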
Oftentimes, datasets are provided for free but hosted on FTP sites, and practitioners need to be able to access them. R is perfectly capable of downloading and importing data from FTP sites.
In this section, we are going to take a look at how to list and download files from the NOAA FTP server using the RCurl package.
#Load required packages
library(RCurl)
## Loading required package: bitops
library(XML)
#Create a list with all the files on the FTP site
Listing <- getURL("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/",
                  dirlistonly = TRUE)
#Clean the listing by splitting it into individual file names
FileList <- strsplit(Listing, split="\r\n")
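#Optional sanity check: count the file names returned by the server and
#preview the first few before downloading anything
length(unlist(FileList))
head(unlist(FileList), 3)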
#Create a new directory to store the downloaded files
DIR <- paste(getwd(),"/NOAAFiles",sep="")
dir.create(DIR)
## Warning in dir.create(DIR): 'E:\Projects\sumendar.github.io\content\post
## \NOAAFiles' already exists
#Loop to download the files
for(FileName in unlist(FileList)){
  URL <- paste0("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/",FileName)
  download.file(URL, destfile=paste0(DIR,"/",FileName), method="auto",
                mode="wb")
}
#A more elegant way
DownloadFile <- function(x){
  URL <- paste0("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/",x)
  download.file(URL, destfile=paste0(DIR,"/",x), method="auto", mode="wb")
}
lapply(unlist(FileList)[1:5], DownloadFile)
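#Downloads over FTP can fail intermittently; one possible safeguard (a
#minimal sketch) wraps download.file() in tryCatch() so a single failure
#does not stop the whole batch
SafeDownload <- function(x){
  URL <- paste0("ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2016/", x)
  tryCatch(download.file(URL, destfile=paste0(DIR, "/", x), mode="wb"),
           error=function(e) message("Failed to download: ", x))
}
lapply(unlist(FileList)[1:5], SafeDownload)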
#Download a compressed archive
URL <- "ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2015/gsod_2015.tar"
download.file(URL, destfile=paste0(DIR,"/gsod_2015.tar"),
method="auto",mode="wb")
untar(paste0(getwd(),"/NOAAFiles/","gsod_2015.tar"),
exdir=paste0(getwd(),"/NOAAFiles"))
help(untar)
#For .zip archives, see help(unzip)
#For more information on the full experiment please visit:
#http://r-video-tutorial.blogspot.ch/2014/12/accessing-cleaning-and-plotting-noaa.html
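Before extracting a large archive, it can be useful to peek inside it first; as a quick optional check, untar() with list=TRUE returns the names of the files contained in the archive without unpacking anything:
#List the first few files contained in the tar archive without extracting it
head(untar(paste0(DIR, "/gsod_2015.tar"), list=TRUE))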
Some data cannot be opened with either read.table
or read.fwf.
In these desperate cases, readLines
can help.
In this section, we are going to take a look at how to parse raw fixed-width weather records line by line with readLines.
#Download the data from the FTP site
URL <- "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/2015/010231-99999-2015.gz"
FileName <- "010231-99999-2015.gz"
download.file(URL, destfile=paste0(getwd(),"/",FileName), method="auto", mode="wb")
data.strings <- readLines(gzfile(FileName, open="rt"))
## Warning in readLines(gzfile(FileName, open = "rt")): seek on a gzfile
## connection returned an internal error
head(data.strings)
## [1] "0071010231999992015010100204+64350+007800FM-15+000099999V0202201N021119999999N999999999+00801+00701999999ADDMA1100401999999REMMET044METAR ENDR 010020Z AUTO 22041KT 08/07 Q1004="
## [2] "0071010231999992015010100504+64350+007800FM-15+000099999V0202201N020619999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010050Z AUTO 22040KT 08/07 Q1003="
## [3] "0071010231999992015010101204+64350+007800FM-15+000099999V0202201N020619999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010120Z AUTO 22040KT 08/07 Q1003="
## [4] "0071010231999992015010101504+64350+007800FM-15+000099999V0202201N019019999999N999999999+00801+00701999999ADDMA1100301999999REMMET044METAR ENDR 010150Z AUTO 22037KT 08/07 Q1003="
## [5] "0071010231999992015010102204+64350+007800FM-15+000099999V0202201N017519999999N999999999+00801+00801999999ADDMA1100301999999REMMET044METAR ENDR 010220Z AUTO 22034KT 08/08 Q1003="
## [6] "0071010231999992015010102504+64350+007800FM-15+000099999V0202201N017519999999N999999999+00801+00701999999ADDMA1100201999999REMMET044METAR ENDR 010250Z AUTO 22034KT 08/07 Q1002="
#Helper functions that extract the fixed-width fields from each record:
#latitude (characters 29-34), longitude (35-41) and air temperature (88-92)
Ext.Latitude <- function(x){
  substr(x, start=29, stop=34)
}
Ext.Longitude <- function(x){
  substr(x, start=35, stop=41)
}
Ext.Temp <- function(x){
  substr(x, start=88, stop=92)
}
LAT <- lapply(data.strings, Ext.Latitude)
LON <- lapply(data.strings, Ext.Longitude)
TEMP <- lapply(data.strings, Ext.Temp)
#Combine the extracted fields into a data.frame, rescaling the raw integers
#to decimal degrees and degrees Celsius
DATA <- data.frame(Latitude=as.numeric(unlist(LAT))/1000,
                   Longitude=as.numeric(unlist(LON))/1000,
                   Temperature=as.numeric(unlist(TEMP))/10)
#The placeholder 999.9 (raw value +9999) marks missing temperatures
DATA[DATA$Temperature==999.9,"Temperature"] <- NA
str(DATA)
## 'data.frame': 17291 obs. of 3 variables:
## $ Latitude : num 64.3 64.3 64.3 64.3 64.3 ...
## $ Longitude : num 7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 7.8 ...
## $ Temperature: num 8 8 8 8 8 8 8 8 8 8 ...
hist(DATA$Temperature, main="Temperature")
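As a final, optional sanity check, summary() reports the temperature range and the number of values replaced with NA:
#Summary of the parsed temperatures, including the count of NA values
summary(DATA$Temperature)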