Академический Документы
Профессиональный Документы
Культура Документы
# This output folder can be created on the fly, but the parent folder will be created
# on the fly only once.
STORE load_student_data INTO '/PigOutput/output_folder_name' USING PigStorage(',');
# The DUMP operator runs the relation as a MapReduce job and displays its output.
Dump relation_name;
# The DESCRIBE operator shows the schema of a relation, but the schema has
# to be provided at the time of loading.
Describe relation_name;
===================================================================================
====================================================
-------------------------------------------------------- GROUPING AND JOINING
---------------------------------------------------------
===================================================================================
====================================================
# The COGROUP operator works the same way as GROUP. The only difference is that GROUP
# works on one relation, while
# COGROUP works on two or more relations.
load_student_details = LOAD '/input_files/student_details.txt' USING
PigStorage(',')
as(ID:int,first_name:chararray,last_name:chararray,Age:int,contact:long,city:charar
ray);
# COGROUP operator
cogroup_data = COGROUP load_student_details by Age, load_employee_details by Age;
dump cogroup_data;
===================================================================================
===============================================
---------------------------------------------------------- JOIN
------------------------------------------------------------------
===================================================================================
===============================================
# Join Operators are used to combine the records from the two relations.
# Join Operator
# Self-Join
# Inner-Join
# Outer-Join : left join, right join and full join
# Loading the same data and creating a relation with the different name....
customers2 = LOAD '/input_files/customers.txt' USING PigStorage(',')as (id:int,
name:chararray, age:int, address:chararray, salary:int);
# Applying the self join
Self_Join12 = JOIN customers1 BY id, customers2 BY id;
# Now, try the self join on the customers1.txt file, which contains an id that occurs
# 2 times.
# INNER JOIN
# The above command outputs the intersection: only those records of one relation
# that match a record in the other relation
# are joined.
# Outer Join
# Outer join has three types:
# Left Outer Join
# Right Outer Join
# Full Join
# Left Outer Join returns all the rows from the left table or relation, even when
# there is no match in the right relation.
# Load customers and orders relations if they are not already loaded.
# Right Outer Join returns all the rows from the right table or relation, even when
# there is no match in the left relation.
# Load customers and orders relations if they are not already loaded.
# Full Outer Join returns the rows from both relations, whether or not a row has a
# match in the other relation; unmatched rows are kept
# with nulls for the fields of the other relation.
dump full_outer_join;
===================================================================================
===============================================
---------------------------------------------------------- CROSS
-----------------------------------------------------------------
===================================================================================
===============================================
# The CROSS operator computes the cross product of two or more relations.
===================================================================================
===============================================
---------------------------------------------------------- UNION
-----------------------------------------------------------------
===================================================================================
===============================================
# The UNION operator combines two relations: the output contains the tuples of the
# first relation together with the tuples of the second (Pig does not guarantee their order).
dump union_data;
===================================================================================
===============================================
---------------------------------------------------------- SPLIT
-----------------------------------------------------------------
===================================================================================
===============================================
# The SPLIT operator splits a relation into two or more relations on the basis of
# certain conditions.
===================================================================================
===============================================
---------------------------------------------------------- FILTER
----------------------------------------------------------------
===================================================================================
===============================================
dump filter_data;
===================================================================================
===============================================
---------------------------------------------------------- DISTINCT
--------------------------------------------------------------
===================================================================================
===============================================
# The DISTINCT operator removes duplicate tuples from the relation.
===================================================================================
===============================================
---------------------------------------------------------- FOREACH
---------------------------------------------------------------
===================================================================================
===============================================
# The FOREACH operator generates the specified data from a relation, including a
# relation that has already gone through some transformation.
dump foreach_data;
===================================================================================
===============================================
---------------------------------------------------------- ORDER BY
--------------------------------------------------------------
===================================================================================
===============================================
# The ORDER BY operator sorts the contents of a relation so they are displayed in a
# particular order.
dump order_data;
===================================================================================
===============================================
---------------------------------------------------------- LIMIT
-----------------------------------------------------------------
===================================================================================
===============================================
# The LIMIT operator limits the number of tuples that a relation produces as
# output.
dump limit_data;