Urban Data Science — Ex 3: R

Jianghao Wang

2018-04-26

1 引言

介绍如何利用R进行空间数据分析。

同样地,本练习将以空气质量数据获取分析为例,介绍R在空间数据分析中的应用。

2 空气污染制图

初始化

library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1     √ purrr   0.2.4
## √ tibble  1.4.2     √ dplyr   0.7.4
## √ tidyr   0.8.0     √ stringr 1.2.0
## √ readr   1.1.1     √ forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.3, proj.4 4.9.3
library(ggmap)
library(automap)
## Loading required package: sp

2.1 读取和预览数据

df <- read_csv("aqi.csv")
## Parsed with column specification:
## cols(
##   lat = col_double(),
##   lon = col_double(),
##   city = col_character(),
##   idx = col_integer(),
##   stamp = col_integer(),
##   pol = col_character(),
##   x = col_character(),
##   aqi = col_character(),
##   tz = col_character(),
##   utime = col_datetime(format = ""),
##   img = col_character()
## )
head(df)
## # A tibble: 6 x 11
##     lat   lon city                      idx  stamp pol   x     aqi   tz   
##   <dbl> <dbl> <chr>                   <int>  <int> <chr> <chr> <chr> <chr>
## 1  25.7 100.  Dali City Environmenta~  1387 1.52e9 pm25  7223  89    +0800
## 2  20.0  79.3 Chandrapur, Chandrapur~  2813 1.52e9 pm25  8682  80    +0530
## 3  35.2 128.  Sangdae-dong, Jinju-si~  1778 1.52e9 pm25  4015  78    +0900
## 4  47.4 124.  Anju Compound, Qiqihae~  1144 1.52e9 pm25  3631  61    +0800
## 5  39.5  76.0 Patrol unit, Kashi (喀什~  1377 1.52e9 pm25  7239  57    +0800
## 6  28.1 120.  青田二中, Qīngtián, Lishui~   915 1.52e9 pm25  4830  155   +0800
## # ... with 2 more variables: utime <dttm>, img <chr>
df$aqi <- as.numeric(df$aqi)
hist(df$aqi, breaks = 30)

ggplot(df, aes(x = aqi)) + 
  geom_histogram() +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 清洗数据

利用dplyrtidyr来进行数据的清晰,并学习使用管道 %>%

df <- df %>% 
  filter(!is.na(aqi)) %>%
  select(-img) %>%
  select(lon, lat, aqi) %>%
  mutate(aqi = as.numeric(aqi))

ggplot(df, aes(x = lon, y = lat)) +
    geom_point(aes(colour = aqi))

library(RColorBrewer)
display.brewer.all()

2.3 空间可视化

sp <- df
coordinates(sp) <- ~ lon + lat
proj4string(sp) <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
str(sp)
## Formal class 'SpatialPointsDataFrame' [package "sp"] with 5 slots
##   ..@ data       :Classes 'tbl_df', 'tbl' and 'data.frame':  448 obs. of  1 variable:
##   .. ..$ aqi: num [1:448] 89 80 78 61 57 155 154 147 53 50 ...
##   ..@ coords.nrs : int [1:2] 1 2
##   ..@ coords     : num [1:448, 1:2] 100.2 79.3 128.1 123.9 76 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:448] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "lon" "lat"
##   ..@ bbox       : num [1:2, 1:2] 72.88 7.88 139.22 50.43
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:2] "lon" "lat"
##   .. .. ..$ : chr [1:2] "min" "max"
##   ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
##   .. .. ..@ projargs: chr "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0"
spplot(sp)

sf <- st_as_sf(sp)
plot(sf, graticule = TRUE, axes = TRUE)

terrain <-
  get_map(
    location = c(lon = 105, lat = 39),
    zoom = 4,
    maptype = "terrain",
    source = "google",
    messaging = FALSE
  )
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=39,105&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
colours = c( "#009966", "#ffde33", "#ff9933", "#cc0033", "#660099", "#7e0023")
ggmap(terrain) + 
  geom_point(data = df, aes(x = lon, y = lat, colour = aqi)) +
  scale_colour_gradientn(colours = colours)

library(leaflet)
pal <- colorQuantile("YlOrRd", NULL, n=5)
leaflet(df) %>% addTiles()%>% 
  addCircleMarkers(~lon, ~lat , popup=df$aqi,
                   radius = log2(df$aqi),
                   stroke = FALSE, fillOpacity = 0.8,
                   color= ~pal(df$aqi)) 

2.4 空间插值

library(raster)
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:tidyr':
## 
##     extract
library(rasterVis)
## Loading required package: lattice
## Loading required package: latticeExtra
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(rgdal)
## rgdal: version: 1.2-18, (SVN revision 718)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/hello/Documents/R/win-library/3.4/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/hello/Documents/R/win-library/3.4/rgdal/proj
##  Linking to sp version: 1.2-7
r <- raster("pop_10km.tif")
r
## class       : RasterLayer 
## dimensions  : 406, 484, 196504  (nrow, ncol, ncell)
## resolution  : 10000, 10000  (x, y)
## extent      : -2630000, 2210000, 1870000, 5930000  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=aea +lat_1=25 +lat_2=47 +lat_0=0 +lon_0=105 +x_0=0 +y_0=0 +ellps=krass +units=m +no_defs 
## data source : D:\Course\ucasgisapp\pop_10km.tif 
## names       : pop_10km 
## values      : 0, 1852488  (min, max)
plot(r)

levelplot(r %>% log())

grid <- as(r, "SpatialGridDataFrame")
obs <- spTransform(x = sp, CRSobj = grid@proj4string)

library(automap)
pm25 <- autoKrige(aqi ~ 1, obs, grid, 
                  remove_duplicates = T, debug.level = 0)
pm25.cv <- autoKrige.cv(aqi ~ 1, obs, nfold = 10)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |=================================================================| 100%
ggplot(as.data.frame(pm25.cv$krige.cv_output), aes(x = var1.pred, y = observed)) + 
  geom_point() + 
  geom_smooth(method = lm) +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  xlim(0, 300) +
  ylim(0, 300)

lm(pm25.cv$krige.cv_output$var1.pred ~ pm25.cv$krige.cv_output$observed - 1) %>% summary()
## 
## Call:
## lm(formula = pm25.cv$krige.cv_output$var1.pred ~ pm25.cv$krige.cv_output$observed - 
##     1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523.17   14.91   30.48   45.32  265.83 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## pm25.cv$krige.cv_output$observed  0.73374    0.02084   35.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.56 on 447 degrees of freedom
## Multiple R-squared:  0.735,  Adjusted R-squared:  0.7344 
## F-statistic:  1240 on 1 and 447 DF,  p-value: < 2.2e-16
plot(pm25)