noctua 2.6.1.9000

Feature:

Bug Fix:

Internals:

noctua 2.6.1

Bug Fix:

noctua 2.6.0

Feature:

noctua 2.5.1

Bug Fix:

noctua 2.5.0

Feature:

noctua 2.4.0

Feature:

Bug Fix:

noctua 2.3.0

Feature:

import awswrangler as wr

import getpass
bucket = getpass.getpass()
path = f"s3://{bucket}/data/"


if "awswrangler_test" not in wr.catalog.databases().values:
    wr.catalog.create_database("awswrangler_test")

cols = ["id", "dt", "element", "value", "m_flag", "q_flag", "s_flag", "obs_time"]

df = wr.s3.read_csv(
    path="s3://noaa-ghcn-pds/csv/189",
    names=cols,
    parse_dates=["dt", "obs_time"])  # Read 10 files from the 1890 decade (~1GB)

wr.s3.to_parquet(
    df=df,
    path=path,
    dataset=True,
    mode="overwrite",
    database="awswrangler_test",
    table="noaa"
);

wr.catalog.table(database="awswrangler_test", table="noaa")

library(DBI)

con <- dbConnect(noctua::athena())

# Query ran using CSV output
system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa")
})
# Info: (Data scanned: 80.88 MB)
#    user  system elapsed
#  57.004   8.430 160.567 

noctua::noctua_options(cache_size = 1)

# Query ran using UNLOAD Parquet output
system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa", unload = T)
})
# Info: (Data scanned: 80.88 MB)
#    user  system elapsed 
#  21.622   2.350  39.232 

# Query ran using cache
system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa", unload = T)
})
# Info: (Data scanned: 80.88 MB)
#    user  system elapsed 
#  13.738   1.886  11.029 

noctua 2.2.0

Bug Fix:

Feature:

library(data.table)
library(DBI)

x = 5

dt = data.table(
  var1 = sample(LETTERS, size = x, T),
  var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)

con <- dbConnect(noctua::athena())

#> Version: 2.2.0

sqlData(con, dt)

# Registered S3 method overwritten by 'jsonify':
#   method     from    
#   print.json jsonlite
# Info: Special characters "\t" has been converted to " " to help with Athena reading file format tsv
#    var1                                                   var2
# 1:    1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 2:    2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 3:    3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 4:    4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 5:    5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}

#> Version: 2.1.0

sqlData(con, dt)

# Info: Special characters "\t" has been converted to " " to help with Athena reading file format tsv
#    var1                                        var2
# 1:    1 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 2:    2 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 3:    3 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 4:    4 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 5:    5 1:3|list(var5 = c("a", "b", "c", "d", "e"))

v-2.2.0 now converts lists into JSON lines format so that AWS Athena can parse them with SQL array/map/json functions. A small downside is that an S3 method conflict occurs when jsonify is called to convert lists into JSON lines. jsonify was chosen in favour of jsonlite due to its performance improvements (#156).
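As a purely illustrative sketch of why the JSON lines format helps, the example below writes dt from above to a hypothetical table called "json_test" and parses the var2 column with Athena's json_extract/json_extract_scalar functions; the table name and query are assumptions, not part of this release.

library(DBI)

con <- dbConnect(noctua::athena())

# upload dt (from the example above); the list column var2 is serialised
# to JSON lines on the way up ("json_test" is a hypothetical table name)
dbWriteTable(con, "json_test", dt, overwrite = TRUE)

# the JSON stored in var2 can now be parsed with Athena's json functions
dbGetQuery(con, "
  select var1,
         json_extract(var2, '$.var3') as var3,
         json_extract_scalar(var2, '$.var4.var5[0]') as first_var5
  from json_test
")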

noctua 2.1.0

Bug Fix:

Feature:

noctua 2.0.1

Bug Fix:

noctua 2.0.0

API Change

By utilising environments for AthenaConnection and AthenaResult, all AthenaResult classes created from an AthenaConnection will point to the same ptr and info environments for their connection. Previously ptr and info were copied, so modifying one would not affect the child or parent class, for example:

# Old Method
library(DBI)
con <- dbConnect(noctua::athena(),
                 rstudio_conn_tab = F)

res <- dbExecute(con, "select 'helloworld'")

# modifying parent class to influence child
con@info$made_up <- "helloworld"

# nothing happened
res@connection@info$made_up
# > NULL

# modifying child class to influence parent
res@connection@info$made_up <- "oh no!"

# nothing happened
con@info$made_up
# > "helloworld"

# New Method
library(DBI)
con <- dbConnect(noctua::athena(),
                 rstudio_conn_tab = F)

res <- dbExecute(con, "select 'helloworld'")

# modifying parent class to influence child
con@info$made_up <- "helloworld"

# picked up change
res@connection@info$made_up
# > "helloworld"

# modifying child class to influence parent
res@connection@info$made_up <- "oh no!"

# picked up change
con@info$made_up
# > "oh no!"

New Feature

library(DBI)
library(noctua)

# default conversion methods
con <- dbConnect(noctua::athena())

# change json conversion method
noctua_options(json = "character")
noctua:::athena_option_env$json
# [1] "character"

# change json conversion to custom method
noctua_options(json = jsonify::from_json)
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>

# change bigint conversion without affecting custom json conversion methods
noctua_options(bigint = "numeric")
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>

noctua:::athena_option_env$bigint
# [1] "numeric"

# change binary conversion without affecting bigint or json methods
noctua_options(binary = "character")
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>

noctua:::athena_option_env$bigint
# [1] "numeric"

noctua:::athena_option_env$binary
# [1] "character"

# no conversion for json objects
con2 <- dbConnect(noctua::athena(), json = "character")

# use custom json parser
con <- dbConnect(noctua::athena(), json = jsonify::from_json)

Bug Fix

noctua 1.10.0

New Feature

library(DBI)
library(noctua)
con <- dbConnect(athena())
dbGetPartition(con, "test_df2", .format = T)
# Info: (Data scanned: 0 Bytes)
#    year month day
# 1: 2020    11  17
dbGetPartition(con, "test_df2")
# Info: (Data scanned: 0 Bytes)
#                    partition
# 1: year=2020/month=11/day=17

library(DBI)
con <- dbConnect(noctua::athena(), bigint = "numeric")

When switching between the different file parsers, the bigint setting is translated to the equivalent representation for that file parser, i.e. data.table: "integer64" -> vroom: "I".
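A minimal sketch of this behaviour, assuming the file_parser argument of noctua_options() shown later in the 1.5.0 notes:

library(DBI)
library(noctua)

# ask for bigint columns as numeric on this connection
con <- dbConnect(noctua::athena(), bigint = "numeric")

# switch to the vroom file parser; the bigint choice is re-mapped to the
# parser's own representation (data.table "integer64" <-> vroom "I")
noctua_options(file_parser = "vroom")

df <- dbGetQuery(con, "select cast(1 as bigint) as big_col")
# bigints come back as numeric, regardless of which parser is active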

Bug Fix

Documentation

noctua 1.9.1

Note

Error: write_parquet requires the arrow package, please install it first and try again

Bug Fix

Error in seq.default(1, length(l), 1000) : wrong sign in 'by' argument

Now a warning message will be returned:

Warning message:
Failed to remove AWS S3 files from: "s3://{bucket}/{prefix}/". Please check if AWS S3 files exist.

noctua 1.9.0

Minor Change

library(DBI)
library(data.table)

X <- 1010
value <- data.table(x = 1:X,
                    y = sample(letters, X, replace = T), 
                    z = sample(c(TRUE, FALSE), X, replace = T))

con <- dbConnect(noctua::athena())

# create a removable table with 1010 parquet files in AWS S3.
dbWriteTable(con, "rm_tbl", value, file.type = "parquet", overwrite = T, max.batch = 1)

# old method: delete_object
system.time({dbRemoveTable(con, "rm_tbl", confirm = T)})
# user  system elapsed 
# 31.004   8.152 115.906 

# new method: delete_objects
system.time({dbRemoveTable(con, "rm_tbl", confirm = T)})
# user  system elapsed 
# 17.319   0.370  22.709 

New Feature

Bug Fix

Info: The S3 objects in prefix will be deleted:
  s3://bucket/path/schema/table
Info: The S3 objects in prefix will be deleted:
  s3://bucket/path/schema/table

To overcome this, dbRemoveTable now opts for paws::s3()$list_objects_v2 instead of paws::s3()$list_objects when listing the S3 objects to be deleted. This allows noctua to iterate over the AWS S3 prefix using continuation tokens, instead of deleting objects in chunks.
* s3_upload_location simplified how the S3 location is built. The s3.location parameter is no longer altered; only the additional components, e.g. name, schema and partition, are appended.
* dbplyr v-2.0.0's in_schema function now wraps strings in quotes, which breaks db_query_fields.AthenaConnection. db_query_fields.AthenaConnection now removes any quotation from the string so that it can search AWS Glue for table metadata. (#117)
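For illustration only, the pagination pattern that list_objects_v2 enables looks roughly like the sketch below; the bucket, prefix and loop are assumptions for demonstration purposes rather than noctua's internal code.

library(paws)

s3 <- paws::s3()

token <- NULL
keys <- character()

repeat {
  # fetch up to 1000 keys per call; ContinuationToken resumes from the
  # previous page ("bucket" and "path/schema/table/" are placeholders)
  resp <- s3$list_objects_v2(
    Bucket = "bucket",
    Prefix = "path/schema/table/",
    ContinuationToken = token
  )
  keys <- c(keys, vapply(resp$Contents, function(x) x$Key, character(1)))
  if (!isTRUE(resp$IsTruncated)) break
  token <- resp$NextContinuationToken
}

# the collected keys can then be removed in batches with paws::s3()$delete_objects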

noctua 1.8.1

Bug Fix

noctua 1.8.0

New Feature

# Stop AWS Athena when R has been interrupted:

con <- dbConnect(noctua::athena())

# Let AWS Athena keep running when R has been interrupted:

con <- dbConnect(noctua::athena(),
                 keyboard_interrupt = F)

noctua 1.7.1

Minor Change

Documentation:

noctua 1.7.0

New Feature

library(DBI)
con <- dbConnect(noctua::athena())
res <- dbExecute(con, "select * from some_big_table limit 10000")
dbFetch(res, 5000)

Bug Fix

Documentation

noctua 1.6.0

New Feature

library(DBI)

con = dbConnect(noctua::athena())

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Athena method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.247   0.091   2.243 

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Glue method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.110   0.045   1.094 

library(DBI)
con = dbConnect(noctua::athena())
dbWriteTable(con, "iris2", iris, file.type = "json")
dbGetQuery(con, "select * from iris2")

Bug Fix

Documentation

Unit tests:

noctua 1.5.1

Bug Fix

library(readr)
library(microbenchmark)

# creating some dummy data for testing
X <- 1e8
df <- 
data.frame(
    w = runif(X),
    x = 1:X,
    y = sample(letters, X, replace = T), 
    z = sample(c(TRUE, FALSE), X, replace = T))
write_csv(df, "test.csv")

# read in text file into raw format
obj <- readBin("test.csv", what = "raw", n = file.size("test.csv"))

format(object.size(obj), units = "auto")
# 3.3 Gb

# writeBin in a loop
write_bin <- function(
  value,
  filename,
  chunk_size = 2L ^ 20L) {
  
  total_size <- length(value)
  split_vec <- seq(1, total_size, chunk_size)
  
  con <- file(filename, "a+b")
  on.exit(close(con))
  
  sapply(split_vec, function(x){writeBin(value[x:min(total_size,(x+chunk_size-1))],con)})
  invisible(TRUE)
}


microbenchmark(writeBin_loop = write_bin(obj, tempfile()),
               readr = write_file(obj, tempfile()),
               times = 5)

# Unit: seconds
# expr       min       lq      mean    median        uq       max neval
# R_loop 41.463273 41.62077 42.265778 41.908908 42.022042 44.313893     5
# readr  2.291571  2.40495  2.496871  2.542544  2.558367  2.686921     5

# Before
translate_sql("2019-01-01", con = con)
# '2019-01-01'

# Now
translate_sql("2019-01-01", con = con)
# DATE '2019-01-01'

# R code:
paste("hi", "bye", sep = "-")

# SQL translation:
('hi'||'-'||'bye')

library(DBI)
library(dplyr)

con <- dbConnect(noctua::athena())

tbl(con, "iris") %>%
  compute(name = "temp.iris")

New Feature

library(DBI)
library(dplyr)

con <- dbConnect(noctua::athena())

# ident method:
t1 <- system.time(tbl(con, "iris"))

# sub query method:
t2 <- system.time(tbl(con, sql("select * from iris")))

# ident method
# user  system elapsed 
# 0.082   0.012   0.288 

# sub query method
# user  system elapsed 
# 0.993   0.138   3.660 

Unit test

noctua 1.5.0

New Feature

library(noctua)

noctua_options("vroom")

Unit tests

Documentation

noctua 1.4.0

Major Change

warning('Appended `file.type` is not compatible with the existing Athena DDL file type and has been converted to "', File.Type,'".', call. = FALSE)

Bug fix

Unit Tests

New Feature

Minor Change

noctua 1.3.0

Major Change

Performance results

library(DBI)
X <- 1e8
df <- data.frame(w =runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))
con <- dbConnect(noctua::athena())
# upload dataframe with different splits
dbWriteTable(con, "test_split1", df, compress = T, max.batch = nrow(df), overwrite = T) # no splits
dbWriteTable(con, "test_split2", df, compress = T, max.batch = 0.05 * nrow(df), overwrite = T) # 20 splits
dbWriteTable(con, "test_split3", df, compress = T, max.batch = 0.1 * nrow(df), overwrite = T) # 10 splits

AWS Athena performance results from AWS console (query executed: select count(*) from .... ):

library(DBI)
X <- 1e8
df <- data.frame(w =runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))
con <- dbConnect(noctua::athena())
dbWriteTable(con, "test_split1", df, compress = T, overwrite = T) # default will now split compressed file into 20 equal size files.

Added an information message to inform the user about which files have been added to the S3 location when overwriting an Athena table.

Minor Change

Bug Fix

Unit tests

noctua 1.2.1

New Features:

Bug fixed

noctua 1.2.0

Minor Change

Backend Change

library(DBI)

con <- dbConnect(noctua::athena())

dbWriteTable(con, "iris", iris)

Bug Fix

Unit Tests

New Feature

Minor Change

noctua 1.1.0

New Features

Bug fix

Unit Tests

Minor Change

Major Change

noctua 1.0.0

New Features

DBI

Athena lower level api