Вы находитесь на странице: 1 из 7

Session 5

HIVE EXERCISE

A. Create Database
----------------------
-- Create the `retail` database used by the rest of this exercise.
-- (The statement sits on its own line: a line starting with `--` is a comment
-- in Hive, so it must not share a line with the heading underline.)
CREATE DATABASE retail;

B. Select Database
-----------------
-- Make `retail` the current database so unqualified table names resolve to it.
USE retail;

C. Create table for storing transactional records


------------------------------------------------
-- Raw transaction records stored as comma-delimited text.
-- The column order must match the field order of the file that is loaded.
CREATE TABLE txnrecords (
    txnno    INT,
    txndate  STRING,
    custno   INT,
    amount   DOUBLE,
    category STRING,
    product  STRING,
    city     STRING,
    state    STRING,
    spendby  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

D. Load the data into the table


------------------------------
-- Load the local file into txnrecords. OVERWRITE replaces whatever the table
-- already contains instead of appending to it.
LOAD DATA LOCAL INPATH 'txns1.txt'
OVERWRITE INTO TABLE txnrecords;

E. Describing metadata or schema of the table


--------------------------------------------
-- Show the column names and types of txnrecords.
DESCRIBE txnrecords;

F. Counting the number of records
------------------------
-- Count every row in txnrecords.
SELECT COUNT(*)
FROM txnrecords;

G. Counting total spending by category of products


-------------------------------------------------
-- Total amount spent in each product category.
SELECT
    category,
    SUM(amount) AS total_amount
FROM txnrecords
GROUP BY category;

H. Total spending for 10 customers
-------------------
-- Total amount spent per customer, capped at ten rows.
-- There is no ORDER BY, so which ten customers are returned is arbitrary.
SELECT
    custno,
    SUM(amount) AS total_amount
FROM txnrecords
GROUP BY custno
LIMIT 10;

I. Create partitioned table


--------------------------
-- Same columns as txnrecords, except that `category` is removed from the
-- column list and declared as the partition key instead. Rows inside each
-- partition are bucketed by `state` into 10 buckets.
CREATE TABLE txnrecsByCat (
    txnno   INT,
    txndate STRING,
    custno  INT,
    amount  DOUBLE,
    product STRING,
    city    STRING,
    state   STRING,
    spendby STRING
)
PARTITIONED BY (category STRING)
CLUSTERED BY (state) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

J. Configure Hive to allow dynamic partitions


-------------------------------------

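Dynamic partitioning lets Hive derive the partition values from the data being
inserted. However, a query across all partitions could trigger an enormous
MapReduce job if the table data and number of partitions are large. A highly
suggested safety measure is putting Hive into strict mode, which prohibits queries
of partitioned tables without a WHERE clause that filters on partitions. For this
exercise, enable dynamic partitioning, set the dynamic-partition mode to nonstrict
(so that every partition column may be determined dynamically), and enforce
bucketing, as in the following session: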

-- Turn dynamic partitioning on, and use nonstrict mode so that an INSERT may
-- leave every partition column to be determined from the data.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Make inserts honour the target table's CLUSTERED BY ... INTO n BUCKETS clause.
-- NOTE(review): this property is reportedly removed in Hive 2.x, where bucketing
-- is always enforced; confirm against the Hive version in use.
SET hive.enforce.bucketing=true;

K. Load data into partition table


---------------------------------
-- Copy every transaction into the partitioned table. No static value is given
-- in the PARTITION clause, so the partition is chosen per row from `category`,
-- which is selected last because dynamic partition columns must come last in
-- the SELECT list.
INSERT OVERWRITE TABLE txnrecsByCat PARTITION (category)
SELECT
    txn.txnno,
    txn.txndate,
    txn.custno,
    txn.amount,
    txn.product,
    txn.city,
    txn.state,
    txn.spendby,
    txn.category
FROM txnrecords txn
DISTRIBUTE BY category;

==========================
find sales based on age group
==========================

-- Customer master data, read from comma-delimited text.
-- NOTE(review): custno is STRING here but INT in txnrecords and out1, so the
-- join and insert that use it rely on implicit type conversion.
CREATE TABLE customer (
    custno     STRING,
    firstname  STRING,
    lastname   STRING,
    age        INT,
    profession STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load the local customer file; without OVERWRITE the rows are appended.
LOAD DATA LOCAL INPATH '/home/cloudera/hive/custs'
INTO TABLE customer;

-- Holds one row per transaction, enriched with the customer's attributes.
CREATE TABLE out1 (
    custno     INT,
    firstname  STRING,
    age        INT,
    profession STRING,
    amount     DOUBLE,
    product    STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Replace the contents of out1 with every customer/transaction pair that
-- shares a custno. Customers without transactions (and vice versa) are dropped.
-- NOTE(review): a.custno is STRING while b.custno is INT, so this comparison
-- depends on implicit type conversion.
INSERT OVERWRITE TABLE out1
SELECT
    a.custno,
    a.firstname,
    a.age,
    a.profession,
    b.amount,
    b.product
FROM customer a
JOIN txnrecords b
    ON a.custno = b.custno;

-- Preview up to 100 rows of out1.
SELECT *
FROM out1
LIMIT 100;

-- Same columns as out1 plus `level`, the age band assigned to each row.
CREATE TABLE out2 (
    custno     INT,
    firstname  STRING,
    age        INT,
    profession STRING,
    amount     DOUBLE,
    product    STRING,
    level      STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Copy every out1 row into out2 and append its age band as `level`.
-- Columns are listed explicitly (instead of `*`) because the INSERT maps them
-- to out2 by position; a later change to out1 must not silently shift them.
-- Rows whose age is NULL match no WHEN branch and fall through to 'others'.
INSERT OVERWRITE TABLE out2
SELECT
    custno,
    firstname,
    age,
    profession,
    amount,
    product,
    CASE
        WHEN age < 30 THEN 'low'
        WHEN age >= 30 AND age < 50 THEN 'middle'
        WHEN age >= 50 THEN 'old'
        ELSE 'others'
    END AS level
FROM out1;

-- Preview up to 100 rows of out2.
SELECT *
FROM out2
LIMIT 100;

-- Show the column names and types of out2.
DESCRIBE out2;

-- One row per age band with the amount summed over that band.
CREATE TABLE out3 (
    level  STRING,
    amount DOUBLE
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Replace the contents of out3 with the total amount for each age band in out2.
INSERT OVERWRITE TABLE out3
SELECT
    level,
    SUM(amount) AS total_amount
FROM out2
GROUP BY level;

==============
simple join
==============

****emp.txt

****swetha,250000,Chennai
****anamika,200000,Kanyakumari
****tarun,300000,Pondi
****anita,250000,Selam

****email.txt
****swetha,swetha@gmail.com
****tarun,tarun@edureka.in
****nagesh,nagesh@yahoo.com
****venkatesh,venki@gmail.com

-- Employee records (name, salary, city) read from comma-delimited text.
CREATE TABLE employee (
    name   STRING,
    salary FLOAT,
    city   STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load the local employee file; without OVERWRITE the rows are appended.
LOAD DATA LOCAL INPATH 'emp.txt'
INTO TABLE employee;

-- Fetch the employee rows whose name is exactly 'tarun'.
SELECT *
FROM employee
WHERE name = 'tarun';

-- Name-to-email lookup read from comma-delimited text.
CREATE TABLE mailid (
    name  STRING,
    email STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load the local email file; without OVERWRITE the rows are appended.
LOAD DATA LOCAL INPATH 'email.txt'
INTO TABLE mailid;

-- Inner join: only names present in both employee and mailid are returned.
SELECT
    a.name,
    a.city,
    a.salary,
    b.email
FROM employee a
JOIN mailid b
    ON a.name = b.name;

-- Left outer join: every employee is returned; email is NULL when the
-- employee has no matching row in mailid.
SELECT
    a.name,
    a.city,
    a.salary,
    b.email
FROM employee a
LEFT OUTER JOIN mailid b
    ON a.name = b.name;

-- Right outer join: every mailid row is returned; the employee columns
-- (including a.name) are NULL when no employee matches.
SELECT
    a.name,
    a.city,
    a.salary,
    b.email
FROM employee a
RIGHT OUTER JOIN mailid b
    ON a.name = b.name;

-- Full outer join: rows from both tables are returned; whichever side has no
-- match contributes NULLs for its columns.
SELECT
    a.name,
    a.city,
    a.salary,
    b.email
FROM employee a
FULL OUTER JOIN mailid b
    ON a.name = b.name;

Вам также может понравиться