Vapour - lightweight GDAL

Michael D. Sumner

2018-08-09

There’s a function vapour_read_attributes that returns the attributes as list of vectors.

pfile <- system.file("extdata", "point.shp", package = "vapour")
library(vapour)
vapour_read_attributes(pfile)
#> $a
#>  [1]  1  2  3  4  5  6  7  8  9 10

A higher level function somewhere else could wrap that function to return a data frame, but we don’t want that in vapour because it’s not aligned with the goals of being lightweight and reducing the level of interpretation applied. The data.frame function in R is actually a very primitive implememtation for data frames, so we avoid putting that interpretation on the data and leave that up to the developer / user.

mvfile <- system.file("extdata", "tab", "list_locality_postcode_meander_valley.tab", package="vapour")

dat <- as.data.frame(vapour_read_attributes(mvfile),  stringsAsFactors = FALSE)

dim(dat)
#> [1] 58 11

head(dat)
#>   LOCAL_ID        NAME POSTCODE PLAN_REF   GAZ_DATE NOM_REG_NO
#> 1   100422    Caveside     7304  CPR5322 1970-01-01       947L
#> 2   100366     Weegena     7304  CPR5327 1970-01-01      1300M
#> 3   100337   Kimberley     7304  CPR5361 1970-01-01      1063T
#> 4   100308     Parkham     7304  CPR5327 1970-01-01      1179Y
#> 5   100263   Frankford     7275  CPR5142 1970-01-01      1003Q
#> 6   100311 Bridgenorth     7277  CPR5140 1970-01-01       925X
#>                                      UFI          CREATED_ON
#> 1 {4a5db4da-ca19-41a0-8dd4-c28a14bbee18} 2016-03-04 10:42:37
#> 2 {253b676e-2791-469c-ac5e-9cb3a95cc158} 2015-06-19 13:46:50
#> 3 {75f60a99-4c58-4d3e-911d-bbaa9a04164c} 2016-09-16 10:54:56
#> 4 {b008d456-4e80-4237-80f6-a26c03817e3c} 2014-06-06 16:50:22
#> 5 {953f4006-6397-4d03-af97-507eab170862} 2014-12-08 09:07:12
#> 6 {5cf8e2a3-631c-4d52-a79c-0ce475f76848} 2015-05-11 10:49:09
#>                                LIST_GUID SHAPE_AREA SHAPE_LEN
#> 1 {839edd46-01a7-4a45-9d97-499962fa952b}      -9999  39785.88
#> 2 {de35ebd4-0ac0-4299-947d-87d07c69426a}      -9999  31587.54
#> 3 {73ced9ad-ee9a-41d5-a5bc-c95c4ab948d9}      -9999  35693.32
#> 4 {37f17d1f-d2a0-4b78-ba0d-e5f62f216658}      -9999  67614.51
#> 5 {47f3a313-913f-4f83-8dc1-021907f9ee80}      -9999  70140.78
#> 6 {425d01cc-223b-4348-b965-384a98fd7999}      -9999  38156.70

A low-level function will return a character vector of JSON, GML, KML or WKT.

vapour_read_geometry(pfile)[5:6]  ## format = "WKB"
#> [[1]]
#>  [1] 01 01 00 00 00 00 00 60 08 18 ad ec 3f 00 00 e0 9a ec 77 e2 3f
#> 
#> [[2]]
#>  [1] 01 01 00 00 00 00 00 c0 40 3c bb d0 3f 00 00 80 0e 30 25 d5 3f

vapour_read_geometry_text(pfile)[5:6]  ## format = "json"
#> [[1]]
#> [1] "{ \"type\": \"Point\", \"coordinates\": [ 0.89612962375395, 0.577139189234003 ] }"
#> 
#> [[2]]
#> [1] "{ \"type\": \"Point\", \"coordinates\": [ 0.261427939636633, 0.330394758377224 ] }"

vapour_read_geometry_text(pfile, textformat = "gml")[2]
#> [[1]]
#> [1] "<gml:Point><gml:coordinates>0.145755324047059,0.395469118840992</gml:coordinates></gml:Point>"

## don't do this with a non-longlat data set like cfile
vapour_read_geometry_text(pfile, textformat = "kml")[1:2]
#> [[1]]
#> [1] "<Point><coordinates>0.623376188334078,0.380098037654534</coordinates></Point>"
#> 
#> [[2]]
#> [1] "<Point><coordinates>0.145755324047059,0.395469118840992</coordinates></Point>"

cfile <- system.file("extdata/sst_c.gpkg", package = "vapour")
str(vapour_read_geometry_text(cfile, textformat = "wkt")[1:2])
#> List of 2
#>  $ : chr "MULTILINESTRING ((-16254.4210476553 -3269904.98849485,-48956.5880244328 -3282652.40200143,-82133.8545994558 -33"| __truncated__
#>  $ : chr "MULTILINESTRING ((-18812.5003359828 -3784514.28779524,-40153.6937313319 -3789439.97415405,-37747.2881609 -38569"| __truncated__

Combine these together to get a custom data set.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
dat <- as.data.frame(vapour_read_attributes(cfile),  stringsAsFactors = FALSE) %>%
    dplyr::mutate(wkt = vapour_read_geometry_text(cfile, textformat = "wkt"))
glimpse(dat)
#> Observations: 7
#> Variables: 3
#> $ level <chr> "275", "280", "285", "290", "295", "300", "305"
#> $ sst   <dbl> 1.85, 6.85, 11.85, 16.85, 21.85, 26.85, 31.85
#> $ wkt   <list> ["MULTILINESTRING ((-16254.4210476553 -3269904.98849485...

There is a function vapour_read_extent to return a straightforward bounding box vector for every feature, so that we can flexibly build an index of a data set for later use.

mvfile <- system.file("extdata", "tab", "list_locality_postcode_meander_valley.tab", package="vapour")
str(head(vapour_read_extent(mvfile)))
#> List of 6
#>  $ : num [1:4] 448353 457706 5386606 5397352
#>  $ : num [1:4] 453544 459318 5403972 5412505
#>  $ : num [1:4] 454840 461042 5411562 5417892
#>  $ : num [1:4] 461505 476213 5410911 5424854
#>  $ : num [1:4] 471573 483157 5417110 5424645
#>  $ : num [1:4] 491638 494048 5417262 5419331

This makes for a very lightweight summary data set that will scale to hundreds of large inputs.

Raster data

Find raster info.

f <- system.file("extdata", "sst.tif", package = "vapour")
vapour_raster_info(f)
#> $geotransform
#> [1] 140.00000000   0.07000000   0.00000000 -39.99722207   0.00000000
#> [6]  -0.07000389
#> 
#> $dimXY
#> [1] 143 286
#> 
#> $minmax
#> [1] 271.350 289.859
#> 
#> $tilesXY
#> [1] 143  14
#> 
#> $projection
#> [1] "GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563,AUTHORITY[\"EPSG\",\"7030\"]],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0],UNIT[\"degree\",0.0174532925199433],AUTHORITY[\"EPSG\",\"4326\"]]"
#> 
#> $bands
#> [1] 1

SQL is available for general GDAL vector data.

OGRSQL

Note that each lower-level function accepts a sql argument, which sends a query to the GDAL library to be executed against the data source, this can create custom layers and so is independent of and ignores the layer argument. Note that the same sql statement can be passed to the geometry readers, so we get the matching sets of information. vapour_read_geometry will return NULL for each missing geometry if the statement doesn’t include geometry explicitly or implicitly, but vapour_read_geometry, vapour_read_geometry_text and vapour_read_extent all explicitly modify the statement “SELECT *“. (We are also assuming the data source hasn’t changed between accesses … let me know if this causes you problems!).

vapour_read_attributes(mvfile, sql = "SELECT NAME, PLAN_REF FROM list_locality_postcode_meander_valley WHERE POSTCODE = 7310")
#> $NAME
#> character(0)
#> 
#> $PLAN_REF
#> character(0)


vapour_read_attributes(mvfile, sql = "SELECT NAME, PLAN_REF, FID FROM list_locality_postcode_meander_valley WHERE POSTCODE = 7306")
#> $NAME
#> [1] "Cradle Mountain" "Mount Roland"    "Middlesex"       "Lower Beulah"   
#> 
#> $PLAN_REF
#> [1] "CPR5363" "CPR5359" "CPR5362" "CPR5361"
#> 
#> $FID
#> [1] 20 31 36 45

Also note that FID is a special row number value, to be used a as general facility for selecting by structural row. This FID is driver-dependent, it can be 0- or 1-based, or completely arbitrary.

Variously, drivers (GDAL’s formats) are 0- or 1- based with the FID. Others (such as OSM) are arbitrary, and have non-sequential (and presumably persientent) FID values.

library(vapour)
file0 <- "list_locality_postcode_meander_valley.tab"
mvfile <- system.file("extdata/tab", file0, package="vapour")
layer <- gsub(".tab$", "", basename(mvfile))
## get the number of features by FID (DISTINCT should be redundant here)
vapour_read_attributes(mvfile, sql = sprintf("SELECT COUNT(DISTINCT FID) AS nfeatures FROM %s", layer))
#> $nfeatures
#> [1] 58

## note how TAB is 1-based 
vapour_read_attributes(mvfile, sql = sprintf("SELECT COUNT(*) AS n FROM %s WHERE FID < 2", layer))
#> $n
#> [1] 1

## but SHP is 0-based
shp <- system.file("extdata/point.shp", package="vapour")
vapour_read_attributes(shp, sql = sprintf("SELECT COUNT(*) AS n FROM %s WHERE FID < 2", "point"))
#> $n
#> [1] 2

See http://www.gdal.org/ogr_sql.html

There are many useful higher level operations that can be used with this. The simplest is the ability to use GDAL as a database-like connection to attribute tables.