Вы находитесь на странице: 1из 7

pig

# Loading the data file which is on HDFS.


load_student_data = LOAD '/input_files/student_data.txt' USING PigStorage(',');

# Same load as above, but with an explicit schema so fields can be referenced by name.
# contact is declared long rather than int: assuming it holds 10-digit phone numbers
# (TODO confirm against student_data.txt), the values exceed the int range and Pig
# would load them as null.
load_student_data = LOAD '/input_files/student_data.txt' USING PigStorage(',')
    AS (id:int, first_name:chararray, last_name:chararray, contact:long, city:chararray);

# To see the result of the above relation


dump load_student_data;

#storing the data inside a file


# File will get created at hdfs
STORE load_student_data INTO '/output_folder_name' Using PigStorage(',');

# The output folder, and any missing parent folders, are created on the fly.
# The STORE can run only once per path: it fails if the output folder already exists.
STORE load_student_data INTO '/PigOutput/output_folder_name' USING PigStorage(',');

# Diagnostic operations are used to Diagnose the relations.

# This operation is used to run the relation as a MapReduce job and see the output.
Dump relation_name;

# describe operator is used to see the schema of the relation, but the schema has
to be provided at the time of loading.
Describe relation_name;

# Illustrate operator gives us a step-by-step execution of a sequence of statements


Illustrate relation_name;

===================================================================================
====================================================
-------------------------------------------------------- GROUPING AND JOINING
---------------------------------------------------------
===================================================================================
====================================================

# Grouping a relation with respect to some field.

# Load the comma-separated student details from HDFS with an explicit schema.
# The schema is kept on whole-token line breaks; a type name such as chararray
# must not be split across lines.
load_student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')
    AS (ID:int, first_name:chararray, last_name:chararray, Age:int, contact:long,
        city:chararray);

group_students = GROUP load_student_details by Age;


dump group_students;

# Grouping can be done on any number of fields.

group_students_with_agecity = GROUP load_student_details by (Age,city);


dump group_students_with_agecity;
# ALL operator can be used for grouping all the tuples of a relation.

# Load the comma-separated student details from HDFS with an explicit schema.
# The schema is kept on whole-token line breaks; a type name such as chararray
# must not be split across lines.
load_student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')
    AS (ID:int, first_name:chararray, last_name:chararray, Age:int, contact:long,
        city:chararray);

group_student_with_all = GROUP load_student_details ALL;


dump group_student_with_all;

# COGROUP operator works in the same way as group. Only difference is that group
works on one relation, while
# COGROUP works on two or more relations.
# First input of the COGROUP: student details with an explicit schema.
# The schema is kept on whole-token line breaks; a type name such as chararray
# must not be split across lines.
load_student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')
    AS (ID:int, first_name:chararray, last_name:chararray, Age:int, contact:long,
        city:chararray);

load_employee_details = LOAD '/input_files/employee_details.txt' USING


PigStorage(',') as(ID:int,first_name:chararray,Age:int,city:chararray);

# COGROUP operator
cogroup_data = COGROUP load_student_details by Age, load_employee_details by Age;
dump cogroup_data;

===================================================================================
===============================================
---------------------------------------------------------- JOIN
------------------------------------------------------------------
===================================================================================
===============================================

# Join Operators are used to combine the records from the two relations.

# Join Operator

# Self-Join
# Inner-Join
# Outer-Join : left join, right join and full join

# Self Join is used to join a table with itself

# Loading a data and creating a relation....


customers1 = LOAD '/input_files/customers.txt' USING PigStorage(',')as (id:int,
name:chararray, age:int, address:chararray, salary:int);

# Loading the same data and creating a relation with the different name....
customers2 = LOAD '/input_files/customers.txt' USING PigStorage(',')as (id:int,
name:chararray, age:int, address:chararray, salary:int);
# Applying the self join
Self_Join12 = JOIN customers1 BY id, customers2 BY id;

# Seeing the result using dump


dump Self_Join12;

# Now, trying self join on the customers1.txt file, in which an id occurs twice.

customers3 = LOAD '/input_files/customers1.txt' USING PigStorage(',')as (id:int,


name:chararray, age:int, address:chararray, salary:int);

customers4 = LOAD '/input_files/customers1.txt' USING PigStorage(',')as (id:int,


name:chararray, age:int, address:chararray, salary:int);

# Applying the self join


Self_Join34 = JOIN customers3 by id, customers4 by id;

# Seeing the result using dump operator ....


dump Self_Join34;

# INNER JOIN

customers = LOAD '/input_files/customers.txt' USING PigStorage(',')as (id:int,


name:chararray, age:int, address:chararray, salary:int);

orders = LOAD '/input_files/orders.txt' USING PigStorage(',')as (oid:int,


date:chararray, customer_id:int, amount:int);

# Inner join is also referred to as equijoin, and it is used quite frequently


# It creates a new relation based on matching column values of two relations.
# When the join predicate is satisfied, the column values for each matched pair of
# row A and row B are combined into a result row.

# Loading the customers data ....


customers = LOAD '/input_files/customers.txt' USING PigStorage(',')as (id:int,
name:chararray, age:int, address:chararray, salary:int);

# Loading the orders data ....


orders = LOAD '/input_files/orders.txt' USING PigStorage(',')as (oid:int,
date:chararray, customer_id:int, amount:int);

Inner_Join = JOIN customers BY id, orders BY customer_id;

# Seeing the result of the Inner_Join .....


dump Inner_Join;

# The above command gives the intersection as output: only those records of one
# relation that match a record in the other relation get joined.
# Outer Join
# outer join is also of three types:-
# Left Outer Join
# Right Outer Join
# Full Join

# Left Outer Join returns all the rows from the left table or relation, even when
# there is no match in the right relation.

# Load customers and orders relations if they are not already loaded.

# Applying the left outer join.


left_outer_join = JOIN customers by id LEFT OUTER, orders by customer_id;

# Seeing the output created by above relation....


dump left_outer_join;

# Right Outer Join returns all the rows from the right table or relation, even when
# there is no match in the left relation.

# Load customers and orders relations if they are not already loaded.

# Applying the right outer join.


right_outer_join = JOIN customers by id RIGHT OUTER, orders by customer_id;

# Seeing the output created by above relation....


dump right_outer_join;

# Full Outer Join returns all the rows from both relations, whether or not there is
# a match in the other relation ....

full_outer_join = JOIN customers BY id FULL OUTER, orders BY customer_id;

dump full_outer_join;

# customers relation is already created in the grunt shell

# Loading the orders2.txt using Pig


orders2 = LOAD '/input_files/orders2.txt' USING PigStorage(',')as (oid:int,
date:chararray, customer_id:int, amount:int);

# Applying the full outer join


# full_outer_join2 joins customers with orders2 (not orders), so it is a separate
# relation from the earlier full_outer_join.
full_outer_join2 = JOIN customers BY id FULL OUTER, orders2 BY customer_id;

# Seeing the output of the above relation
# (dump full_outer_join2, the relation just created, not the earlier full_outer_join)
dump full_outer_join2;

===================================================================================
===============================================
---------------------------------------------------------- CROSS
-----------------------------------------------------------------
===================================================================================
===============================================

# Cross operation will perform the cross products of the two or more relations....

# customers and orders relations have been loaded.

cross_data = CROSS customers, orders;

===================================================================================
===============================================
---------------------------------------------------------- UNION
-----------------------------------------------------------------
===================================================================================
===============================================

# Union Operator is used to combine the two relations. It means one relation will
come up and below it will be the second relation.

student1 = LOAD '/input_files/student_data.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray, phone:chararray,
city:chararray);

student2 = LOAD '/input_files/student_data2.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray, phone:chararray,
city:chararray);

# Union the two student relations loaded above (student1 and student2), which are
# declared with the same schema. The tuples of both relations end up in union_data.
union_data = UNION student1, student2;

dump union_data;

===================================================================================
===============================================
---------------------------------------------------------- SPLIT
-----------------------------------------------------------------
===================================================================================
===============================================

# Split operator is used for splitting a relation into 2 relations on the basis of
a certain condition.

student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray, age:int, phone:chararray,
city:chararray);

SPLIT student_details into student_details1 if age<23, student_details2 if age>=23;

===================================================================================
===============================================
---------------------------------------------------------- FILTER
----------------------------------------------------------------
===================================================================================
===============================================

# Filter operator is used to select the required tuples from a relation.

student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray, age:int, phone:chararray,
city:chararray);

filter_data = FILTER student_details by city == 'Pune';

dump filter_data;

===================================================================================
===============================================
---------------------------------------------------------- DISTINCT
--------------------------------------------------------------
===================================================================================
===============================================

# Distinct operator will remove the redundant data from the relation

student_details1 = LOAD '/input_files/student_details1.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray, phone:chararray,
city:chararray);

# distinct_data holds the tuples of student_details1 with duplicates removed.
distinct_data = DISTINCT student_details1;

# Dump the de-duplicated relation; dumping student_details1 would still show
# the redundant tuples.
dump distinct_data;

===================================================================================
===============================================
---------------------------------------------------------- FOREACH
---------------------------------------------------------------
===================================================================================
===============================================

# Foreach operator is used for generating specified data from relation or relation
which has gone through some transformation.

student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray,age:int, phone:chararray,
city:chararray);

foreach_data = FOREACH student_details GENERATE id,age,city;

dump foreach_data;

===================================================================================
===============================================
---------------------------------------------------------- ORDER BY
--------------------------------------------------------------
===================================================================================
===============================================
# Order by operator is used for displaying content of relation in a particular
order.

student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray,age:int, phone:chararray,
city:chararray);

order_data = ORDER student_details BY id DESC;

dump order_data;

===================================================================================
===============================================
---------------------------------------------------------- LIMIT
-----------------------------------------------------------------
===================================================================================
===============================================

# Limit operator is used for limiting the number of tuples from the relation as an
output

student_details = LOAD '/input_files/student_details.txt' USING PigStorage(',')


as (id:int, firstname:chararray, lastname:chararray,age:int, phone:chararray,
city:chararray);

limit_data = LIMIT student_details 4;

dump limit_data;

Вам также может понравиться