Вы находитесь на странице: 1из 7

Q2 a) Hive solutions:

#Creating a directory in hadoop:

hdfs dfs -ls /user/

hdfs dfs -mkdir /user/seshadri

hdfs dfs -mkdir /user/seshadri/data

#Storing the files on hadoop:

hdfs dfs -put u.data /user/Seshadri/data/

hdfs dfs -put u.item /user/Seshadri/data/

hdfs dfs -put u.user /user/Seshadri/data/

hdfs dfs -ls /user/Seshadri/data

#Starting Hive session:

hive

#Creating Hive tables:

create table ratings(user_id INT, item_id INT , rating INT , timestamp INT)

row format delimited

fields terminated by “|”;

load data inpath ‘/user/Seshadri/data/u.data’ overwrite into table ratings;

create table movies(movie_id INT , movie_title STRING, release_date STRING ,video release date
STRING, IMDb_URL STRING , unknown INT, Action INT, Adventure INT, Animation INT, Children INT ,
Comedy INT, Crime INT, Documentry INT, Drama INT, Fantasy INT, Film_Noir INT, Horror INT, Musical
INT, Mystery INT, Romance INT, Sci-Fi INT, Thriller INT , War INT, Western INT)

row format delimited

fields terminated by “| \t”

load data inpath ‘/user/Seshadri/data/u.item’ overwrite into table movies;

create table user(user_id INT, age INT, gender STRING, occupation STRING, zip code STRING)
row format delimited

fields terminated by “|”

load data inpath ‘/user/Seshadri/data/u.user ‘ overwrite into table user;

#The files has been loaded as tables in HIVE datawarehouse

hdfs dfs -ls /user/hive/warehouse

Q2 b)

i) Top 30 most rated movies

Create table most_rated_movies as

Select a.user_id,b.movie_id,b.movie_title,a.rating from ratings a left join movies b on


(a.item_id=b.movie_id) ;

Create table top30_rated_movies as

Select movie_title,count(movie_title) as rating_count from most_rated_movies group by movie_title

Order by rating_count desc limit 30;

select * from top30_rated_movies;

ii) Top 30 Highest rated movies

create table highest_rated_movies as

select movie_title,avg(rating) as avg_rating from most_rated_movies group by movie_title order by


avg_rating desc limit 30;

select * from highest_rated_movies;

iii) Top 30 movies having atleast 50,20 and 100 raters with their average rating

atleast 50 users:
create table t1 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=50 order by avg_rating desc limit 30;

select * from t1;

#atleast 20 users:

create table t2 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=20 order by avg_rating desc limit 30;

select * from t2;

#atleast 100 users:

create table t3 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=100 order by avg_rating desc limit 30;

select * from t3;

Q2 c)

# Joining the ratings table and the user table

create table t4 as

select a.user_id,b.gender,b.occupation,a.item_id,a.rating from ratings a left join user b on


(a.user_id=b.user_id );

#Joining table4 with movies table

create table t5 as

select a.*,b.movie_title,( case

when b.unknown = 1 then "Unknown"

else " "

end) as Unknown,
( case

when b.Action = 1 then "Action"

else " "

end) as Action,

( case

when b.Adventure = 1 then "Adventure"

else " "

end) as Adventure,

( case

when b.Animation = 1 then "Animation"

else " "

end) as Animation,

( case

when b.Children = 1 then "Children"

else " "

end) as Children,

( case

when b.Comedy = 1 then "Comedy"

else " "

end) as Comedy,

( case

when b.Crime = 1 then "Crime"

else " "

end) as Crime,

( case

when b.Documentry = 1 then "Documentry"

else " "

end) as Documentry,

( case
when b.Drama = 1 then "Drama"

else " "

end) as Drama,

( case

when b.Fantasy = 1 then "Fantasy"

else " "

end) as Fantasy,

( case

when b.Film_Noir = 1 then "Film_Noir"

else " "

end) as Film_Noir,

( case

when b.Horror = 1 then "Horror"

else " "

end) as Horror,

( case

when b.Musical = 1 then "Musical"

else " "

end) as Musical,

( case

when b.Mystery = 1 then "Mystery"

else " "

end) as Mystery,

( case

when b.Romance = 1 then "Romance"

else " "

end) as Romance,

( case
when b.Sci_Fi = 1 then "Sci-Fi"

else " "

end) as Sci_Fi,

( case

when b.Thriller = 1 then "Thriller"

else " "

end) as Thriller,

( case

when b.War = 1 then "War"

else " "

end) as War,

( case

when b.Western = 1 then "Western"

else " "

end) as Western

from t4 a left join movies b on (a.item_id=b.movie_id);

# Average rating per gender:

create table t6 as

select gender , avg(rating) as avg_rating from t5 group by gender;

Q2 c) a)

#Creating a column column for genre by concatenating the all the genres.

create table t7 as

select *, Concat(TRIM(unknown),TRIM(Action),TRIM(Adventure) , TRIM(Animation) , TRIM(Children) ,


TRIM(Comedy) , TRIM(Crime) , TRIM(Documentry) , TRIM(Drama) , TRIM(Fantasy) , TRIM(Film_Noir) ,
TRIM(Horror) , TRIM(Musical) , TRIM(Mystery) , TRIM(Romance) , TRIM(Sci_Fi) , TRIM(Thriller) ,
TRIM(War) , TRIM(Western)) as Genre from t5;
#The average rating per gender and genre

#For Males

create table t8 as

select gender,Genre,avg(rating) as avg_rating from t7 where gender ="M" group by gender,Genre order
by avg_rating desc limit 30;

#For females

create table t9 as

select gender,Genre,avg(rating) as avg_rating from t7 where gender ="F" group by gender,Genre order
by avg_rating desc limit 30;

Q2 d) The average rating given by users from each occupations

create table t10 as

select occupation,avg(rating) as avg_rating from t4 group by occupation order by avg_rating desc ;

Q2 e) The count of ratings and average rating of each genre.

create table t11 as

select Genre,count(rating) as count_rating,avg(rating) as avg_rating from t7 group by Genre order by


avg_rating desc ;