Hive Complex Data Types

Array

$ vi arrayfile

1,abc,40000,a$b$c,hyd

2,def,3000,d$f,bang

3,abc,40000,a$b$c,hyd

4,def,3000,d$f,bang

5,abc,40000,a$b$c,hyd

6,def,3000,d$f,bang

7,abc,40000,a$b$c,hyd

8,def,3000,d$f,bang

9,abc,40000,a$b$c,hyd

10,def,3000,d$f$d$e$d$e$e$r$g,bang

hive> create table array_tab (id int, name string, salary bigint, sub array<string>, city string)

> row format delimited

> fields terminated by ','

> collection items terminated by '$';

hive> load data local inpath '/root/arrayfile' into table array_tab;

hive> select * from array_tab;

1 abc 40000 ["a","b","c"] hyd

2 def 3000 ["d","f"] bang

3 abc 40000 ["a","b","c"] hyd

4 def 3000 ["d","f"] bang

5 abc 40000 ["a","b","c"] hyd

6 def 3000 ["d","f"] bang

7 abc 40000 ["a","b","c"] hyd

8 def 3000 ["d","f"] bang

9 abc 40000 ["a","b","c"] hyd

10 def 3000 ["d","f","d","e","d","e","e","r","g"] bang

hive> describe array_tab;

id int

name string

salary bigint

sub array<string>

city string

Time taken: 0.79 seconds, Fetched: 5 row(s)

hive> select sub[0] from array_tab where id=1;

….

Map

$ vi mapfile

1,abc,40000,a$b$c,pf#500$epf#200,hyd

2,def,3000,d$f,pf#500,bang

2,abc,40000,a$b$c,pf#500$epf#200,hyd

3,def,3000,d$f,pf#500,bang

4,abc,40000,a$b$c,pf#500$epf#200,hyd

5,def,3000,d$f,pf#500,bang

6,abc,40000,a$b$c,pf#500$epf#200,hyd

7,def,3000,d$f,pf#500,bang

8,abc,40000,a$b$c,pf#500$epf#200,hyd

hive> create table arr_map_tab (id int, name string, salary bigint, sub array<string>, details map<string, int>, city string)

> row format delimited

> fields terminated by ','

> collection items terminated by '$'

> map keys terminated by '#';

hive> load data local inpath 'mapfile' into table arr_map_tab;

hive> select * from arr_map_tab;

1 abc 40000 ["a","b","c"] {"pf":500,"epf":200} hyd

2 def 3000 ["d","f"] {"pf":500} bang

2 abc 40000 ["a","b","c"] {"pf":500,"epf":200} hyd

3 def 3000 ["d","f"] {"pf":500} bang

4 abc 40000 ["a","b","c"] {"pf":500,"epf":200} hyd

5 def 3000 ["d","f"] {"pf":500} bang

6 abc 40000 ["a","b","c"] {"pf":500,"epf":200} hyd

7 def 3000 ["d","f"] {"pf":500} bang

8 abc 40000 ["a","b","c"] {"pf":500,"epf":200} hyd

Time taken: 2.04 seconds, Fetched: 9 row(s)

hive> describe arr_map_tab;

id int

name string

salary bigint

sub array<string>

details map<string,int>

city string

Time taken: 0.838 seconds, Fetched: 6 row(s)

hive> select details["pf"] from arr_map_tab limit 1;

…

500

Time taken: 33.805 seconds, Fetched: 1 row(s)

Struct

$ vi structfile

1,abc,40000,a$b$c,pf#500$epf#200,hyd$ap$500001

2,def,3000,d$f,pf#500,bang$kar$600038

4,abc,40000,a$b$c,pf#500$epf#200,bhopal$MP$452013

5,def,3000,d$f,pf#500,Indore$MP$452014

hive> create table arr_map_struct_tab (id int, name string, salary bigint, sub array<string>, details map<string, int>, address struct<city:string, state:string, pin:int>)

> row format delimited
> fields terminated by ','
> collection items terminated by '$' > map keys terminated by #';
OK

Time taken: 4.982 seconds

hive> describe arr_map_struct_tab;

id int

name string

salary bigint

sub array<string>

details map<string,int>

address struct<city:string,state:string,pin:int>

Time taken: 1.416 seconds, Fetched: 6 row(s)

hive> load data local inpath 'structfile' into table arr_map_struct_tab;

hive> select * from arr_map_struct_tab;

1 abc 40000 ["a","b","c"] {"pf":500,"epf":200} {"city":"hyd","state":"ap","pin":500001}

2 def 3000 ["d","f"] {"pf":500} {"city":"bang","state":"kar","pin":600038}

4 abc 40000 ["a","b","c"] {"pf":500,"epf":200} {"city":"bhopal","state":"MP","pin":452013}

5 def 3000 ["d","f"] {"pf":500} {"city":"Indore","state":"MP","pin":452014}

Time taken: 1.226 seconds, Fetched: 4 row(s)

hive> select address.city from arr_map_struct_tab where details["pf"]="500" and sub[0]="a";

…

hyd

bhopal

Time taken: 20.286 seconds, Fetched: 2 row(s)

Uniontype

hive> CREATE TABLE union_tab(col1 UNIONTYPE<INT, DOUBLE, STRING, ARRAY<string>, STRUCT<a:INT,b:string>>)

> row format delimited

> fields terminated by ','

> COLLECTION ITEMS TERMINATED BY '|'

> LINES TERMINATED BY '\n';

Time taken: 2.356 seconds

$ vi unionfile

0|1

0|12

1|1.234

1|2.3456

2|dinesh

2|Dinesh Sachdev

hive> load data local inpath 'unionfile' overwrite into table union_tab;

hive> select * from union_tab;

{0:1}

{0:12}

{1:1.234}

{1:2.3456}

{2:"dinesh"}

{2:"Dinesh Sachdev"}

Time taken: 1.211 seconds, Fetched: 6 row(s)

It becomes quiet simple to load data into uniontype for primitives. But what about complex types? For example if we edit ‘unionfile’ and append an array:

$vi unionfile

0|1

0|12

1|1.234

1|2.3456

2|dinesh

2|Dinesh Sachdev

3|din|esh|sach|dev

hive> load data local inpath 'unionfile' overwrite into table union_tab;

hive> select * from union_tab;

{0:1}

{0:12}

{1:1.234}

{1:2.3456}

{2:"dinesh"}

{2:"Dinesh Sachdev"}

{3:["din|esh|sach|dev"]}

Time taken: 1.11 seconds, Fetched: 7 row(s)

There is only a single element in array whereas we expected to have array of 4 strings [“din”,”esh”,”sach”,”dev”]

For this we can use create_union UDF:

hive> insert into table union_tab

> select create_union(4,1, 1.02,"d", array("d","f"), named_struct('a',1, 'b','dinesh')) from sample_07 limit 1;
...
...

hive> insert into table union_tab

> select create_union(3,1, 1.02,"d", array("d","f"), named_struct('a',1, 'b','dinesh')) from sample_07 limit 1;

hive> select * from union_tab;
OK
{4:{"a":1,"b":"dinesh"}}
{3:["d","f"]}
{0:1}
{0:12}
{1:1.234}
{1:2.3456}
{2:"dinesh"}
{2:"Dinesh Sachdev"}
{3:["din|esh|sach|dev"]}
Time taken: 0.064 seconds, Fetched: 9 row(s)

Comments

Sairam AsapuDecember 14, 2017 at 3:16 AM
boss, how can I query an union? suppose say I want to say "select from union_tab where "
ReplyDelete
Replies

Add comment

QueryDB

Search This Blog

Hive Complex Data Types

Comments

Post a Comment

Popular posts

Hive Parse JSON with Array Columns and Explode it in to Multiple rows.

org.apache.spark.sql.AnalysisException: Cannot overwrite a path that is also being read from.;

Read from a hive table and write back to it using spark sql

Hadoop Distcp Error Duplicate files in input path

Scala Spark building Jar leads java.lang.StackOverflowError