This topic describes the file formats supported by DLA.

​CREATE EXTERNAL TABLE IF NOT EXISTS test_avro (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS AVRO
LOCATION 'oss://bucket001/datasets/test/test_avro/';​

ORC

​CREATE EXTERNAL TABLE test_orc (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS ORC
LOCATION 'oss://bucket001/datasets/test/test_orc';​

Parquet

​CREATE EXTERNAL TABLE test_parquet_hive_serde (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS PARQUET
LOCATION 'oss://bucket001/datasets/test/test_parquet_hive_serde';​

RcFile

​CREATE EXTERNAL TABLE test_rcfile (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS RCFILE
LOCATION 'oss://bucket001/datasets/test/test_rcfile';​

TextFile: Normal

​CREATE EXTERNAL TABLE IF NOT EXISTS test_text_null (
    URL STRING,
    TITLE STRING
)
STORED AS TEXTFILE
LOCATION 'oss://bucket001/datasets/test/test_null/textfile/';​

TextFile: OpenCSV

When you use OpenCSVSerde, note the following items:

  • You can specify the delimiters, content quotes, and escape characters for fields in a row. Example: WITH SERDEPROPERTIES ("separatorChar" = ",", "quoteChar”\" = "`", "escapeChar" = "\");
  • Row delimiters for fields are not supported.
  • All fields must be of the STRING type.
  • For fields of other types, you can use SQL functions to convert the data type to STRING.
​CREATE EXTERNAL TABLE test_opencsv (
  id STRING,
  name STRING,
  location STRING,
  create_date STRING,
  create_timestamp STRING,
  longitude STRING,
  latitude STRING
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties(
"separatorChar"=",",
"quoteChar"="\"",
"escapeChar"="\\"
)
STORED AS TEXTFILE 
LOCATION 'oss://bucket001/datasets/test/test_opencsv';​

TextFile: MultiDelimitSerDe

MultiDelimitSerDe allows you to specify multiple characters as a column delimiter in text files.

​CREATE EXTERNAL TABLE test_csv_multidelimit (
  id STRING,
  name STRING,
  location STRING,
  create_date STRING,
  create_timestamp STRING,
  longitude STRING,
  latitude STRING
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties(
"field.delim"="||"
)
STORED AS TEXTFILE 
LOCATION 'oss://bucket001/datasets/test/test_multidelimiter ';

RegexSerDe

RegexSerDe uses regular expressions to extract fields in data when data types are not specified.

​CREATE EXTERNAL TABLE IF NOT EXISTS test_regex (
  host STRING,
  identity STRING,
  `user` STRING,
  time STRING,
  request STRING,
  status STRING,
  size INT,
  referer STRING,
  agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
)
STORED AS TEXTFILE
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/test_regex';​

JSON: org.apache.hadoop.hive.serde2.JsonSerDe

​CREATE EXTERNAL TABLE IF NOT EXISTS `customer_case_jiahe`.`single_latin1_broken` (
    `id` int ,
    `name` string ,
    `age` int 
)
STORED AS JSON
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/customer_case/jiahe/single_latin1_encode_broken_record.log';​

JSON: org.apache.hive.hcatalog.data.JsonSerDe

​CREATE external TABLE json_table_1 (
   docid string,
   user_1 struct<
               id:INT,
               username:string,
               name:string,
               shippingaddress:struct<
                                      address1:string,
                                      address2:string,
                                      city:string,
                                      state:string
                                      >,
               orders:array<
                            struct<
                                 itemid:INT,
                                  orderdate:string
                                  >
                              >
               >
   )
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/json/hcatalog_serde/table_1/test_json.json';​

JSON EsriJsonSerDe

DLA allows you to use EsriJsonSerDe to process Esri ArcGIS files in JSON format. For details about the file format, see https://github.com/Esri/spatial-framework-for-hadoop/wiki/JSON-Formats.

​CREATE EXTERNAL TABLE IF NOT EXISTS california_counties
 (
 Name string,
 BoundaryShape binary
 )
ROW FORMAT SERDE 'com.esri.hadoop.hive.serde.EsriJsonSerDe'
STORED AS INPUTFORMAT 'com.esri.json.hadoop.EnclosedEsriJsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/geospatial/california-counties/'​