Using DBI with Arrow

Kirill Müller

29/09/2022

Who this tutorial is for

This tutorial is for you if you want to leverage Apache Arrow for accessing and manipulating data on databases. See vignette("DBI", package = "DBI") and vignette("DBI", package = "DBI-advanced") for tutorials on accessing data using R’s data frames instead of Arrow’s structures.

Rationale

Apache Arrow is

a cross-language development platform for in-memory analytics.

New classes and generics

Prepare

library(DBI)

con <- dbConnect(RSQLite::SQLite())

data <- data.frame(
  a = 1:3,
  b = 4.5,
  c = "five"
)

dbWriteTable(con, "tbl", data)

Read all rows from a table

dbReadTableArrow(con, "tbl")
## <nanoarrow_array_stream struct<a: int32, b: double, c: string>>
##  $ get_schema:function ()  
##  $ get_next  :function (schema = x$get_schema(), validate = TRUE)  
##  $ release   :function ()
as.data.frame(dbReadTableArrow(con, "tbl"))
##   a   b    c
## 1 1 4.5 five
## 2 2 4.5 five
## 3 3 4.5 five

Run queries

stream <- dbGetQueryArrow(con, "SELECT COUNT(*) FROM tbl WHERE a < 3")
stream
## <nanoarrow_array_stream struct<COUNT(*): int32>>
##  $ get_schema:function ()  
##  $ get_next  :function (schema = x$get_schema(), validate = TRUE)  
##  $ release   :function ()
as.data.frame(stream)
##   COUNT(*)
## 1        2

Process data piecemeal

stream <- dbGetQueryArrow(con, "SELECT * FROM tbl WHERE a < 3")
stream
## <nanoarrow_array_stream struct<a: int32, b: double, c: string>>
##  $ get_schema:function ()  
##  $ get_next  :function (schema = x$get_schema(), validate = TRUE)  
##  $ release   :function ()
stream$get_next()
## <nanoarrow_array struct[2]>
##  $ length    : int 2
##  $ null_count: int 0
##  $ offset    : int 0
##  $ buffers   :List of 1
##   ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
##  $ children  :List of 3
##   ..$ a:<nanoarrow_array int32[2]>
##   .. ..$ length    : int 2
##   .. ..$ null_count: int 0
##   .. ..$ offset    : int 0
##   .. ..$ buffers   :List of 2
##   .. .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
##   .. .. ..$ :<nanoarrow_buffer data<int32>[2][8 b]> `1 2`
##   .. ..$ dictionary: NULL
##   .. ..$ children  : list()
##   ..$ b:<nanoarrow_array double[2]>
##   .. ..$ length    : int 2
##   .. ..$ null_count: int 0
##   .. ..$ offset    : int 0
##   .. ..$ buffers   :List of 2
##   .. .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
##   .. .. ..$ :<nanoarrow_buffer data<double>[2][16 b]> `4.5 4.5`
##   .. ..$ dictionary: NULL
##   .. ..$ children  : list()
##   ..$ c:<nanoarrow_array string[2]>
##   .. ..$ length    : int 2
##   .. ..$ null_count: int 0
##   .. ..$ offset    : int 0
##   .. ..$ buffers   :List of 3
##   .. .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
##   .. .. ..$ :<nanoarrow_buffer data_offset<int32>[3][12 b]> `0 4 8`
##   .. .. ..$ :<nanoarrow_buffer data<string>[8 b]> `fivefive`
##   .. ..$ dictionary: NULL
##   .. ..$ children  : list()
##  $ dictionary: NULL
stream$get_next()
## NULL

Prepared queries

in_arrow <- nanoarrow::as_nanoarrow_array(data.frame(a = 1:4))
stream <- dbGetQueryArrow(con, "SELECT $a AS batch, * FROM tbl WHERE a < $a", param = in_arrow)
as.data.frame(stream)
##   batch a   b    c
## 1     2 1 4.5 five
## 2     3 1 4.5 five
## 3     3 2 4.5 five
## 4     4 1 4.5 five
## 5     4 2 4.5 five
## 6     4 3 4.5 five

Writing data

stream <- dbGetQueryArrow(con, "SELECT * FROM tbl WHERE a < 3")
dbWriteTableArrow(con, "tbl_new", stream)
dbReadTable(con, "tbl_new")
##   a   b    c
## 1 1 4.5 five
## 2 2 4.5 five

Appending data

stream <- dbGetQueryArrow(con, "SELECT * FROM tbl WHERE a < 3")
dbCreateTableArrow(con, "tbl_split", stream)
dbAppendTableArrow(con, "tbl_split", stream)
## [1] TRUE
stream <- dbGetQueryArrow(con, "SELECT * FROM tbl WHERE a >= 3")
dbAppendTableArrow(con, "tbl_split", stream)
## [1] TRUE
dbReadTable(con, "tbl_split")
##   a   b    c
## 1 1 4.5 five
## 2 2 4.5 five
## 3 3 4.5 five

As usual, do not forget to disconnect from the database when done.

dbDisconnect(con)

Conclusion

That concludes the major features of DBI. For more details on the library functions covered in this tutorial see the DBI specification at vignette("spec", package = "DBI").