Query and pivot JSON and NDJSON documents

j_query() executes a query against a JSON or NDJSON document, automatically inferring the type of data and path.

j_pivot() transforms a JSON array-of-objects to an object-of-arrays; this can be useful when forming a column-based tibble from row-oriented JSON / NDJSON.

Usage

j_query(
  data,
  path = "",
  object_names = "asis",
  as = "string",
  ...,
  n_records = Inf,
  verbose = FALSE,
  data_type = j_data_type(data),
  path_type = j_path_type(path)
)

j_pivot(
  data,
  path = "",
  object_names = "asis",
  as = "string",
  ...,
  n_records = Inf,
  verbose = FALSE,
  data_type = j_data_type(data),
  path_type = j_path_type(path)
)

Arguments

data: a character() JSON string or NDJSON records, or the name of a file or URL containing JSON or NDJSON, or an R object parsed to a JSON string using jsonlite::toJSON().
path: character(1) JSONpointer, JSONpath or JMESpath query string.
object_names: character(1) order data object elements "asis" (default) or "sort" before filtering on path.
as: character(1) return type. For j_query(), "string" returns JSON / NDJSON strings; "R" parses JSON / NDJSON to R using rules in as_r(). For j_pivot() (JSON only), use as = "data.frame" or as = "tibble" to coerce the result to a data.frame or tibble.
...: passed to jsonlite::toJSON when data is an R object.
n_records: numeric(1) maximum number of NDJSON records parsed.
verbose: logical(1) report progress when parsing large NDJSON files.
data_type: character(1) type of data; one of "json", "ndjson", or a value returned by j_data_type().
path_type: character(1) type of path; one of "JSONpointer", "JSONpath", "JMESpath". Inferred from path using j_path_type().

Details

j_pivot() transforms an 'array-of-objects' (typical when the JSON is a row-oriented representation of a table) to an 'object-of-arrays'. A simple example transforms an array of two objects each with three fields '[{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}]' to an object with three fields, each a vector of length 2 '{"a": [1, 4], "b": [2, 5], "c": [3, 6]}'. The object-of-arrays representation corresponds closely to an R data.frame or tibble, as illustrated in the examples.

j_pivot() with JMESpath paths are especially useful for transforming NDJSON to a data.frame or tibble

Examples

json <- '{
  "locations": [
    {"name": "Seattle", "state": "WA"},
    {"name": "New York", "state": "NY"},
    {"name": "Bellevue", "state": "WA"},
    {"name": "Olympia", "state": "WA"}
  ]
}'

j_query(json, "/locations/0/name")             # JSONpointer
#> [1] "Seattle"
j_query(json, "$.locations[*].name", as = "R") # JSONpath
#> [1] "Seattle"  "New York" "Bellevue" "Olympia" 
j_query(json, "locations[].state", as = "R")   # JMESpath
#> [1] "WA" "NY" "WA" "WA"

## a few NDJSON records from <https://www.gharchive.org/>
ndjson_file <-
    system.file(package = "rjsoncons", "extdata", "2023-02-08-0.json")
j_query(ndjson_file, "{id: id, type: type}")
#>  [1] "{\"id\":\"26939254345\",\"type\":\"DeleteEvent\"}"
#>  [2] "{\"id\":\"26939254358\",\"type\":\"PushEvent\"}"  
#>  [3] "{\"id\":\"26939254361\",\"type\":\"CreateEvent\"}"
#>  [4] "{\"id\":\"26939254365\",\"type\":\"CreateEvent\"}"
#>  [5] "{\"id\":\"26939254366\",\"type\":\"PushEvent\"}"  
#>  [6] "{\"id\":\"26939254367\",\"type\":\"PushEvent\"}"  
#>  [7] "{\"id\":\"26939254379\",\"type\":\"PushEvent\"}"  
#>  [8] "{\"id\":\"26939254380\",\"type\":\"IssuesEvent\"}"
#>  [9] "{\"id\":\"26939254382\",\"type\":\"PushEvent\"}"  
#> [10] "{\"id\":\"26939254383\",\"type\":\"PushEvent\"}"  

j_pivot(json, "$.locations[?@.state=='WA']", as = "string")
#> [1] "{\"name\":[\"Seattle\",\"Bellevue\",\"Olympia\"],\"state\":[\"WA\",\"WA\",\"WA\"]}"
j_pivot(json, "locations[?@.state=='WA']", as = "R")
#> $name
#> [1] "Seattle"  "Bellevue" "Olympia" 
#> 
#> $state
#> [1] "WA" "WA" "WA"
#> 
j_pivot(json, "locations[?@.state=='WA']", as = "data.frame")
#>       name state
#> 1  Seattle    WA
#> 2 Bellevue    WA
#> 3  Olympia    WA
j_pivot(json, "locations[?@.state=='WA']", as = "tibble")
#> # A tibble: 3 × 2
#>   name     state
#>   <chr>    <chr>
#> 1 Seattle  WA   
#> 2 Bellevue WA   
#> 3 Olympia  WA   

## use 'path' to pivot ndjson one record at at time
j_pivot(ndjson_file, "{id: id, type: type}", as = "data.frame")
#>             id        type
#> 1  26939254345 DeleteEvent
#> 2  26939254358   PushEvent
#> 3  26939254361 CreateEvent
#> 4  26939254365 CreateEvent
#> 5  26939254366   PushEvent
#> 6  26939254367   PushEvent
#> 7  26939254379   PushEvent
#> 8  26939254380 IssuesEvent
#> 9  26939254382   PushEvent
#> 10 26939254383   PushEvent

## 'org' is a nested element; extract it
j_pivot(ndjson_file, "org", as = "data.frame")
#>          id              login gravatar_id
#> 1  30044474          gitcoinco            
#> 2 123667276 johnbieren-testing            
#> 3 123277977       CMPUT404-W23            
#> 4 120284018       mornystannit            
#>                                              url
#> 1          https://api.github.com/orgs/gitcoinco
#> 2 https://api.github.com/orgs/johnbieren-testing
#> 3       https://api.github.com/orgs/CMPUT404-W23
#> 4       https://api.github.com/orgs/mornystannit
#>                                           avatar_url
#> 1  https://avatars.githubusercontent.com/u/30044474?
#> 2 https://avatars.githubusercontent.com/u/123667276?
#> 3 https://avatars.githubusercontent.com/u/123277977?
#> 4 https://avatars.githubusercontent.com/u/120284018?

## use j_pivot() to filter 'PushEvent' for organizations
path <- "[{id: id, type: type, org: org}]
             [?@.type == 'PushEvent' && @.org != null] |
                 [0]"
j_pivot(ndjson_file, path, as = "data.frame")
#>            id      type    org.id          org.login org.gravatar_id
#> 1 26939254358 PushEvent 123667276 johnbieren-testing                
#> 2 26939254382 PushEvent 123667276 johnbieren-testing                
#>                                          org.url
#> 1 https://api.github.com/orgs/johnbieren-testing
#> 2 https://api.github.com/orgs/johnbieren-testing
#>                                       org.avatar_url  org.id.1  org.login.1
#> 1 https://avatars.githubusercontent.com/u/123667276? 120284018 mornystannit
#> 2 https://avatars.githubusercontent.com/u/123667276? 120284018 mornystannit
#>   org.gravatar_id.1                                org.url.1
#> 1                   https://api.github.com/orgs/mornystannit
#> 2                   https://api.github.com/orgs/mornystannit
#>                                     org.avatar_url.1
#> 1 https://avatars.githubusercontent.com/u/120284018?
#> 2 https://avatars.githubusercontent.com/u/120284018?

## try also
##
##     j_pivot(ndjson_file, path, as = "tibble") |>
##         tidyr::unnest_wider("org", names_sep = ".")