Thursday, October 6, 2016

Hive Lab6

Hive Lab6 : xml data processing 2

Hive xml2
______________________

hive> create database xxmls;
OK
Time taken: 0.182 seconds
hive> use xxmls;
OK
Time taken: 0.017 seconds

[training@localhost ~]$ cat xml5
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
[training@localhost ~]$ 


hive> create table raw(line string);
OK
Time taken: 0.252 seconds
hive> load data local inpath 'xml5' 
    >  into table raw;
Copying data from file:/home/training/xml5
Copying file: file:/home/training/xml5
Loading data to table xxmls.raw
OK
Time taken: 0.178 seconds
hive> select * from raw;
OK
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
Time taken: 0.155 seconds
hive> 


hive> create table raw2(cid int, 
    > pr array<string>, qnt array<string>);
OK
Time taken: 0.069 seconds
hive> insert overwrite table raw2
    >   select xpath_int(line,'tr/cid'),
    >     xpath(line,'tr/pr/text()'),
    >     xpath(line,'tr/qnt/text()') 
    > from raw;

hive> create table cidpr(cid int , pr int);
hive> insert overwrite table cidpr
    >   select cid, mypr from raw2
    >    lateral view explode(pr) p as mypr;

hive> create table cidqnt(cid int, qnt int);

hive> insert overwrite table cidqnt
    >    select cid, myqnt from raw2
    >   lateral view explode(qnt) q as myqnt;


hive> select * from cidpr;
OK
101     3000
101     5000
101     1000
102     2000
102     5000
101     6000
Time taken: 0.06 seconds
hive> select * from cidqnt;
OK
101     3
101     2
101     5
102     5
102     2
101     5
Time taken: 0.046 seconds
hive> 


hive> add jar Desktop/hivejars.jar;  
hive> create temporary function auto 
    >  as 'hive.analytics.SeqNumber';


 hive> alter table cidpr add columns(n int);
 hive> insert overwrite table cidpr
    select cid, pr, auto() from cidpr;

 hive> alter table cidqnt add columns(n int);
 hive> insert overwrite table cidqnt
    select cid, qnt, auto() from cidqnt;

 hive> create table trans(cid int, pr int, 
             qnt int, bill int);
 hive> insert overwrite table trans
       select l.cid, pr, qnt, pr*qnt
    from cidpr l join cidqnt r
           on (l.n=r.n);

 hive> create table report(cid int, totbill int);
 hive> insert overwrite table report
    select cid, sum(bill) from trans
        group by cid;
 hive> select * from report;
________________________

hive.analytics.SeqNumber.java
______________________
package hive.analytics;

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;

public class SeqNumber extends UDF
{
 int cnt = 0;
public IntWritable  evaluate()
throws IOException
{
cnt++;
return  new IntWritable(cnt);
}


}

______________________________

keep all above statements into a script file.
   
   hscript1.hql
 ________________

 create database urheroes;
 use urheroes;
 create table raw(line string);
 load data local inpath 'xml5' into table raw;
 create table raw2(cid int, pr array<string>, qnt array<string>);
 insert overwrite table raw2
  select xpath_int(line,'tr/cid'),
    xpath(line,'tr/pr/text()'),
    xpath(line,'tr/qnt/text()') from raw;
 create table cidpr(cid int, pr int);
 insert overwrite table cidpr
   select cid, mypr from raw2
    lateral view explode(pr) p as mypr;
 create table cidqnt(cid int, qnt int);
 insert overwrite table cidqnt
   select cid, myq from raw2
    lateral view explode(qnt) q as myq;
 alter table cidpr add columns(n int);
 alter table cidqnt add columns(n int);
 add jar Desktop/hivejars.jar;
 create temporary function auto 
   as 'hive.analytics.SeqNumber';
 insert overwrite table cidpr
   select cid, pr, auto() from cidpr;
 insert overwrite table cidqnt
   select cid, qnt, auto() from cidqnt;
 create table trans(cid int, pr int, qnt int, 
   bill int);
 insert overwrite table trans
   select l.cid, pr, qnt, pr*qnt
   from cidpr l join cidqnt r
   on (l.n=r.n);
 create table report(cid int, totbill int);
 insert overwrite table report
   select cid, sum(bill) from trans
    group by cid;
 select * from trans;
 select * from report;
 drop table raw;
 drop table raw2;
 drop table cidpr;
 drop table cidqnt;
________________________

 how to submit script:

 $ hive -f hscript1.hql

Hive Lab5

Hive Lab5 : xml data processing

[training@localhost ~]$ cat sampxml1
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
[training@localhost ~]$ 

hive> create table samp(line string);
OK
Time taken: 0.309 seconds
hive> load data local inpath 'sampxml1'
    >  into table samp;
Copying data from file:/home/training/sampxml1
Copying file: file:/home/training/sampxml1
Loading data to table ourxml.samp
OK
Time taken: 0.202 seconds
hive> 

hive> select * from samp;
OK
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
Time taken: 0.184 seconds
hive> 

 above xml file has 2 records,
 when this file is loaded into hive table, hive treats, each line as a record,
  as per xml == two records.
  as per hive --> 10 records(rows.).

this data is not ready for hive,
hive expects total xml record in a single line.
so data preperation step is required.

Prepare a mapreduce Job , which can convert xml record into single row.

  (Later we see in Mapreduce session).,

[training@localhost ~]$ cat xml1
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
[training@localhost ~]$ 


hive> create table raw(line string);
OK
Time taken: 0.05 seconds
hive> load data local inpath 'xml1'
    >  into table raw;
Copying data from file:/home/training/xml1
Copying file: file:/home/training/xml1
Loading data to table ourxml.raw
OK
Time taken: 0.123 seconds
hive> 


hive> select * from raw;
OK
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
Time taken: 0.066 seconds
hive> select count(*) from raw;
3
-- 3 rows.

hive> select xpath_string(line,'rec/name') from raw;
Ravi
Rani
Sampath

hive> select 
    >  xpath_string(line,'rec/name'),
    >  xpath_int(line,'rec/age'),
    >  xpath_string(line,'rec/sex'),
    >  xpath_string(line,'rec/city')
    > from raw;

Ravi    25              hyd
Rani    24      f
Sampath 0       m       Del

if string fields is missed, it returns blank string, if numeric field is missed it returns 0.


_________________

xml with nested tags.

[training@localhost ~]$ cat xml2
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
[training@localhost ~]$ 

hive> create table xraw(line string);
OK
Time taken: 0.075 seconds
hive> load data local inpath 'xml2'
    >  into table xraw;

hive> select * from xraw;
OK
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
Time taken: 0.064 seconds
hive> 

hive> create table info(fname string, 
    >   lname string, 
    > age int, personal_email string,
    >  official_email string,
    >  mobile string, office string, 
    >  residence string);
OK
Time taken: 0.042 seconds
hive> 

hive> insert overwrite table info
    >   select 
    > xpath_string(line,'rec/name/fname'),
    > xpath_string(line,'rec/name/lname'),
    > xpath_int(line,'rec/age'),          
    > xpath_string(line,'rec/contact/email/personal'),
    > xpath_string(line,'rec/contact/email/official'),
    > xpath_string(line,'rec/contact/phone/mobile'),
    > xpath_string(line,'rec/contact/phone/office'),
    > xpath_string(line,'rec/contact/phone/residence') 
    >  from xraw;

hive> select * from info;
OK
Ravi    kumar   25      ravi@gmail.com  ravi@infy.com     12345   12346   12347
Time taken: 0.064 seconds
hive> 
_____________

xml with collections.



[training@localhost ~]$ cat xml3
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>
[training@localhost ~]$ 

hive> create table yraw(line string);
OK
Time taken: 0.043 seconds
hive> load data local inpath 'xml3'
    >  into table yraw;

hive> select * from yraw;
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>

hive> 
hive> create table raw2(name string, 
    >    qual  array<string>);
OK
Time taken: 0.039 seconds
hive> insert overwrite table raw2
    >   select xpath_string(line,'rec/name'),
    >  xpath(line,'rec/qual/text()')
    >   from yraw;

hive> select * from raw2;
OK
Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]
Akash   ["Btech","Mba"]
Time taken: 0.061 seconds
hive> 

hive> select name, size(qual) from raw2;

Amar    2
Amala   3
Akash   2

how to access array elements,

 by using index numbers

 indexing starts from 0.

hive> select qual[0], qual[1],
          qual[2] from raw2;
Btech   Mtech   NULL
Bsc     Msc     Mtech
Btech   Mba     NULL

search for elements with in array.

hive> select * from raw2 
    >  where array_contains(qual,'Mtech');


Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]

_______________
Flattening Array elements:

hive> select explode(qual) as q 
   from raw2;

Btech
Mtech
Bsc
Msc
Mtech
Btech
Mba


hive> select name, explode(qual) as q from raw2;
FAILED: Error in semantic analysis: UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive> 

-- above statement is invalid,
   bcoz, udtf s can not be applied with other column expressions.

hive> create table  yinfo(name string, 
    >      qual string);
OK
Time taken: 0.035 seconds
hive> insert overwrite table yinfo 
    >   select name, myq from raw2
    >    lateral view explode(qual) q as myq;

hive> select * from yinfo;
OK
Amar    Btech
Amar    Mtech
Amala   Bsc
Amala   Msc
Amala   Mtech
Akash   Btech
Akash   Mba
Time taken: 0.055 seconds

hive> select * from yinfo   
    >   where qual in ('Msc','Mtech');


Amar    Mtech
Amala   Msc
Amala   Mtech

hive> create table yres(qual string, cnt int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table yres
    >   select qual, count(*) from yinfo
    >    group by qual;

hive> select * from yres;
OK
Bsc     1
Btech   2
Mba     1
Msc     1
Mtech   2
Time taken: 0.051 seconds
hive> 

____________________

Assignment:

[training@localhost ~]$ cat xml4
<tr><cid>101</cid><pr>1000</pr><pr>3000</pr></tr>
<tr><cid>102</cid><pr>1200</pr><pr>2000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>4000</pr></tr>
[training@localhost ~]$ 

hive> create database sales;
hive> use sales;
hive> create table raw(line string);
hive> load data local inpath 'xml4'
   into table raw;
hive> create table raw2(cid string, 
    pr array<string>);

hive> insert overwrite table raw2
   select xpath_string(line,'tr/cid'),
   xpath(line,'tr/pr/text()')
    from raw;
hive> create table raw3(cid string, 
   pr int);
hive> insert overwrite table raw3
    select cid, mypr from  raw2
   lateral view explode(pr) p as mypr;

hive> create table results(cid string, 
   totbill int);
hive> insert overwrite table results
   select cid, sum(pr)
   from raw3
   group by cid;

hive> select * from results;

--- my students are really super and energitic --