Monday, December 4, 2023

Loading JSON Data using External Tables in Oracle Database 23c

Loading JSON Data using External Tables in Oracle Database 23c

It was possible to load JSON documents using external tables in previous versions of the database, but they had to be loaded as CLOBs, and depending on the format it might require some creative external table definitions. In Oracle database 23c loading JSON data using external tables got much simpler.

◉ Directory Object


We create a directory object pointing to the "/tmp" directory on the database server. We make sure our test user can access this directory object.

conn sys/SysPassword1@//localhost:1521/freepdb1 as sysdba

create or replace directory tmp_dir as '/tmp';
grant read, write on directory tmp_dir to testuser1;

conn testuser1/testuser1@//localhost:1521/freepdb1

◉ Array of JSON Objects


We create a file called "fruit_array.json" containing an array of JSON documents.

cat > /tmp/fruit_array.json <<EOF
[
  {"fruit":"apple", "quantity": 20},
  {"fruit":"orange", "quantity": 10},
  {"fruit":"banana", "quantity": 15}
]
EOF

We create an external table pointing at the "fruit_array.json" file. Notice the type is "oracle_bigdata" and the access parameters are set to "com.oracle.bigdata.fileformat = jsondoc". This access type understands JSON documents, which means we don't need to figure out how to describe the contents of the file so they can be loaded accurately.

drop table if exists json_ext;

create table json_ext (
  data json
)
  organization external
    (type oracle_bigdata
     access parameters (com.oracle.bigdata.fileformat = jsondoc)
     location (tmp_dir:'fruit_array.json'))
  parallel
  reject limit unlimited;

We query the data from the "fruit_array.json" file and we see the data is loaded as one JSON document. We use JSON_SERIALIZE to pretty print it.

select json_serialize(data pretty)
from   json_ext;

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
[
  {
    "fruit" : "apple",
    "quantity" : 20
  },
  {
    "fruit" : "orange",
    "quantity" : 10
  },
  {
    "fruit" : "banana",
    "quantity" : 15
  }
]


1 row selected.

SQL>

If we wanted to see each document in the array as a separate row, we just indicate the JSON Path of the array in our external table definition.

drop table if exists json_ext;

create table json_ext (
  data json
)
  organization external
    (type oracle_bigdata
     access parameters (
                         com.oracle.bigdata.json.path = '$[*]'
                         com.oracle.bigdata.fileformat = jsondoc
                       )
     location (tmp_dir:'fruit_array.json'))
  parallel
  reject limit unlimited;

Now when we query the data we see each array element presented as a separate row.

select json_serialize(data pretty)
from   json_ext;

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
{
  "fruit" : "apple",
  "quantity" : 20
}

{
  "fruit" : "orange",
  "quantity" : 10
}

{
  "fruit" : "banana",
  "quantity" : 15
}


3 rows selected.

SQL>

Alternatively we may see the array wrapped up in a document as follows. As long as we set the JSON path to the array correctly, we can still unpack this array.

cat > /tmp/fruit_array2.json <<EOF
{
  "rows": [
    {"fruit":"apple", "quantity": 20},
    {"fruit":"orange", "quantity": 10},
    {"fruit":"banana", "quantity": 15}
  ]
}
EOF

We recreate the external table using the new JSON path.

drop table if exists json_ext;

create table json_ext (
  data json
)
  organization external
    (type oracle_bigdata
     access parameters (
                         com.oracle.bigdata.json.path = '$.rows[*]'
                         com.oracle.bigdata.fileformat = jsondoc
                       )
     location (tmp_dir:'fruit_array2.json'))
  parallel
  reject limit unlimited;

We query the external table and see the array elements as separate rows.

select json_serialize(data pretty)
from   json_ext;

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
{
  "fruit" : "apple",
  "quantity" : 20
}

{
  "fruit" : "orange",
  "quantity" : 10
}

{
  "fruit" : "banana",
  "quantity" : 15
}


3 rows selected.

SQL>

◉ JSON Document Per File


We create three files called "fruit_file{n}.json", each with a single JSON document.

cat > /tmp/fruit_file1.json <<EOF
{"fruit":"apple", "quantity": 20}
EOF

cat > /tmp/fruit_file2.json <<EOF
{"fruit":"orange", "quantity": 10}
EOF

cat > /tmp/fruit_file3.json <<EOF
{"fruit":"banana", "quantity": 15}
EOF

We recreate the external table with the location parameter pointing to multiple "fruit_file*.json" files using a wildcard. This time each document is loaded as a separate row.

drop table if exists json_ext;

create table json_ext (
  data json
)
  organization external
    (type oracle_bigdata
     access parameters (com.oracle.bigdata.fileformat = jsondoc)
     location (tmp_dir:'fruit_file*.json'))
  parallel
  reject limit unlimited;

SELECT json_serialize(data pretty)
FROM   json_ext;

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
{
  "fruit" : "apple",
  "quantity" : 20
}

{
  "fruit" : "orange",
  "quantity" : 10
}

{
  "fruit" : "banana",
  "quantity" : 15
}

3 rows selected.

SQL>

◉ Multiple JSON Documents Per File


We create a file called "fruit_multidoc.json" containing several JSON documents, with each document on a new line.

cat > /tmp/fruit_multidoc.json <<EOF
{"fruit":"apple", "quantity": 20}
{"fruit":"orange", "quantity": 10}
{"fruit":"banana", "quantity": 15}
EOF

We could recreate the external table, but this time we will override the location in the query. We point the table to the "fruit_multidoc.json" file. Even though the documents are read from a single file, they are loaded as a separate row per document.

SELECT json_serialize(data pretty)
FROM   json_ext external modify (
                   location (tmp_dir:'fruit_multidoc.json') 
                 );

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
{
  "fruit" : "apple",
  "quantity" : 20
}

{
  "fruit" : "orange",
  "quantity" : 10
}

{
  "fruit" : "banana",
  "quantity" : 15
}


3 rows selected.

SQL>

◉ Multiline JSON Documents


We create a file called "fruit_multidoc_multiline.json" containing three JSON documents. Each document is spread across multiple lines.

cat > /tmp/fruit_multidoc_multiline.json <<EOF
{
  "fruit":"apple",
  "quantity": 20,
  "extra_info": {"val1": "one",
                 "val2": "two"}
}
{
  "fruit":"orange",
  "quantity": 10,
  "extra_info": {"val1": "three",
                 "val2": "four"}
}
{
  "fruit":"banana",
  "quantity": 15,
  "extra_info": {"val1": "five",
                 "val2": "six"}
}
EOF

We point the table to the "fruit_multidoc_multiline.json" file. The "jsondoc" format is smart enough to understand JSON, so we can have JSON documents that span multiple lines, and they will still be loaded correctly.

Once again we have one row per document. The external table recognized each document was spread across multiple lines, and loaded it correctly.

SELECT json_serialize(data pretty)
FROM   json_ext external modify (
                   location (tmp_dir:'fruit_multidoc_multiline.json') 
                 );

JSON_SERIALIZE(DATAPRETTY)
--------------------------------------------------------------------------------
{
  "fruit" : "apple",
  "quantity" : 20,
  "extra_info" :
  {
    "val1" : "one",
    "val2" : "two"
  }
}

{
  "fruit" : "orange",
  "quantity" : 10,
  "extra_info" :
  {
    "val1" : "three",
    "val2" : "four"
  }
}

{
  "fruit" : "banana",
  "quantity" : 15,
  "extra_info" :
  {
    "val1" : "five",
    "val2" : "six"
  }
}

3 rows selected.

SQL>

◉ Loading JSON Data From External Tables


Now we can read the data using external tables, loading the data into our database can take one of two forms.

- We load the JSON data directly into JSON columns.
- We use the built JSON functions to convert the JSON data into rows and columns in relation tables.

Here are examples of each method.

-- JSON column
drop table if exists json_data_table;

create table json_data_table (
  id        number generated always as identity,
  json_data json
);


insert into json_data_table (json_data)
select data from json_ext;
commit;

-- Relational table.
drop table if exists relational_data_table;

create table relational_data_table (
  id        number generated always as identity,
  fruit     varchar2(10),
  quantity  number
);

-- Dot Notation
insert into relational_data_table(fruit, quantity)
select je.data.fruit,
       je.data.quantity
from   json_ext je;

-- JSON_VALUE
insert into relational_data_table(fruit, quantity)
select json_value(data, '$.fruit'),
       json_value(data, '$.quantity')
from   json_ext;
commit;

Source: oracle-base.com

Related Posts

0 comments:

Post a Comment