Hive Fresco Play Hands-on Solutions - Hacker Rank

Hive Fresco Play hands-on solutions: learn advanced data processing concepts in Hive, including the Hive data model, DDL, DML, SQL, JOINs, partitioning, bucketing, and user-defined functions.

Lab 1: Welcome to Hive Hands-on - Creating Hive tables.

Solution 1: Creating Hive tables.




-- Open a Terminal and type 'hive'.
user1@kq9ca59d26w:/projects/challenge$ hive

-- Task 1: Create database named 'hive_tables'.
hive > create database hive_tables;

hive > use hive_tables;

-- Task 2: Under the hive_tables database, create a managed table named bank_internal. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.

hive > create table bank_internal(age INT, job STRING, marital STRING, balance INT, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');
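
-- Optional check (not required by the lab): 'describe formatted' should report "Table Type: MANAGED_TABLE",
-- confirming bank_internal was created as a managed (internal) table.
hive > describe formatted bank_internal;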

-- Task 3: Load the CSV data into the bank_internal table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank_internal;
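
-- Sanity check (optional): the header row should be skipped, so only data rows appear.
hive > select * from bank_internal limit 5;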

-- Task 4: Create a directory named '/bank_external' in HDFS. (Hint: open a new Terminal and run this command.)
user1@kq9ca59d26w:/projects/challenge$ hdfs dfs -mkdir -p /bank_external

-- Task 5: Create an external table named bank_external and set the location as '/bank_external'. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.

hive > create external table bank_external(age INT, job STRING, marital STRING, balance INT, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' location '/bank_external' tblproperties('skip.header.line.count'='1');

-- Task 6: Copy the bank.csv file into the bank_external table's HDFS location.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank_external;
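
-- Optional check: loading into an external table copies the file into its LOCATION,
-- so bank.csv should now be listed under /bank_external (run in the second Terminal).
user1@kq9ca59d26w:/projects/challenge$ hdfs dfs -ls /bank_external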

Lab 2: Welcome to Hive Hands-on - Insert the data.

Solution 2: Insert the data.




-- Open a new Terminal and type 'hive'.

user@x916p8e7134d:/projects/challenge$ hive

-- Task 1: Create a database named hive_insert.

hive > create database hive_insert;
hive > use hive_insert;

-- Task 2: Under the 'hive_insert' database, create a table named 'bank'. Get the column names for the table from the CSV file.
-- Make sure to skip the header while loading the data into the table.

hive > create table bank(age INT, job STRING, marital STRING, balance DOUBLE, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');

-- Task 3: Load the bank.csv file into the bank table.

hive > load data local inpath '/projects/challenge/bank.csv' into table bank;

-- Task 4: Create a new table named 'bank1' with the following columns: "age", "job", "balance", "month", "duration".

hive > create table bank1(age INT, job STRING, balance DOUBLE, month STRING, duration INT);


-- Task 5: From the bank table, fetch the records having age greater than 30 and duration greater than 300, and load them into the bank1 table.

hive > insert into bank1(age, job, balance, month, duration) select age, job, balance, month, duration from bank where age>30 and duration>300;

-- Task 6: Load the below records into the bank1 table:
--(32,'technician', 900, 'mar', 400), 
--(33,'admin',900, 'may', 500 ), 
--(35,'services', 400, "jul",400 ),
--(36,'blue-collar', 600, "aug",600 ),
--(38,'technician', 800, "sep",700 )

hive > insert into bank1 values (32,'technician',900,'mar',400), (33,'admin',900,'may',500), (35,'services',400,'jul',400), (36,'blue-collar',600,'aug',600), (38,'technician',800,'sep',700);
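
-- Optional check: the row count of bank1 should have grown by exactly 5 after the insert above.
hive > select count(*) from bank1;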

Lab 3: Welcome to Hive Hands-on - Query the data.

Solution 3: Query the data.




-- Open a new Terminal and type 'hive'.
user@z9kv8w2762x:/projects/challenge$  hive

-- Task 1: Create a database named hive_query.
hive > create database hive_query;
hive > use hive_query;

-- Task 2: Under the hive_query database, create a table named 'bank'. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.

hive > create table bank(age INT, job STRING, marital STRING, balance DOUBLE, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');

-- Task 3: Load the bank.csv file into the bank table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank;

-- Task 4: Open a new Terminal and create an HDFS directory '/output1'.
user@z9kv8w2762x:/projects/challenge$  hadoop fs -mkdir -p /output1

-- Task 5: Fetch the records which have job = 'technician', age less than 60, and month in ('jan', 'oct', 'jul'). Save the output in the HDFS directory '/output1'.

hive > insert overwrite directory '/output1' row format delimited fields terminated by ','  select * from bank where job ='technician' and age<60 and month in("jan", "oct","jul");
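
-- Optional check: the hive shell's built-in dfs command can display the exported files;
-- the same check works for /output2 and /output3 below.
hive > dfs -cat /output1/*;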

-- Task 6: Open a new Terminal and create an HDFS directory '/output2'.
user@z9kv8w2762x:/projects/challenge$  hadoop fs -mkdir -p /output2

-- Task 7: Get the count of the records grouped by the job column. Save the output in the HDFS directory '/output2'.
hive > insert overwrite directory '/output2' row format delimited fields terminated by ','  select count(*) from bank group by job;


-- Task 8: Open a new Terminal and create an HDFS directory '/output3'.
user@z9kv8w2762x:/projects/challenge$  hadoop fs -mkdir -p /output3

-- Task 9: Get the maximum duration for each job and save the output in the HDFS directory '/output3'.
hive > insert overwrite directory '/output3' row format delimited fields terminated by ',' select max(duration), job from bank group by job;

Lab 4: Welcome to Hive Hands-on - Partition table.

Solution 4: Partition table.





-- Open a new Terminal and type 'hive'.
user@t4n19po6f891k:/projects/challenge$  hive

-- Task 1: Create a database named 'hive_part' and switch to it, so the tables below land under hive_part.
hive > create database hive_part;
hive > use hive_part;

-- Task 2: Create a temporary table and load the Transactions.csv records into the table. (Use the STRING datatype for the tran_date column.)

hive > CREATE TEMPORARY TABLE tmp1(transaction_id STRING, cust_id INT, tran_date STRING, prod_subcat_code INT, prod_cat_code INT, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) row format delimited fields terminated by ',' lines terminated by '\n';

hive > load data local inpath '/projects/challenge/Transactions.csv' into table tmp1;

-- Enable dynamic partitioning so the partition values in Task 4 can come from the SELECT itself.
hive > set hive.exec.dynamic.partition=true;
hive > set hive.exec.dynamic.partition.mode=nonstrict;

-- Task 3: Under the hive_part database, create a table named transaction_part with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). The partition column should be Store_type.

hive > create table transaction_part(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE) partitioned by (Store_type STRING) row format delimited fields terminated by ',';


-- Task 4: Fetch the records from the temporary table where Qty is greater than 2 and do a dynamic insert into the transaction_part table.

hive > INSERT OVERWRITE TABLE transaction_part PARTITION (Store_type) SELECT transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type FROM tmp1 where Qty>2;
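
-- Optional check: list the Store_type partitions the dynamic insert created.
hive > show partitions transaction_part;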

-- Task 5: Open a new Terminal and create a new HDFS directory '/store'.
user@t4n19po6f891k:/projects/challenge$  hdfs dfs -mkdir -p /store

-- Task 6: Fetch the records from the transaction_part table where the store type is 'e-Shop' and store the result in the HDFS directory '/store'. The fields should be terminated by ','.
 
hive > insert overwrite directory '/store' row format delimited fields terminated by ','  select * from transaction_part where Store_type  = 'e-Shop';
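
-- Optional check: view the exported 'e-Shop' records.
hive > dfs -cat /store/*;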

Lab 5: Welcome to Hive Hands-on - Bucketing table.

Solution 5: Bucketing table.




-- Open a Terminal and type "hive".
user1@kq9ca59d26w:/projects/challenge$ hive

-- Task 1: Create a database named 'hive_bucket' and switch to it.
hive > create database hive_bucket;
hive > use hive_bucket;

-- Task 2: Create a temporary table and load the Transactions.csv records into the table. (Use the STRING datatype for the tran_date column.)

hive > CREATE TEMPORARY TABLE tmp1(transaction_id STRING, cust_id INT, tran_date STRING, prod_subcat_code INT, prod_cat_code INT, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) row format delimited fields terminated by ',' lines terminated by '\n';

hive > load data local inpath '/projects/challenge/Transactions.csv' into table tmp1;

-- Enable dynamic partitioning for the insert in Task 4.
hive > set hive.exec.dynamic.partition=true;
hive > set hive.exec.dynamic.partition.mode=nonstrict;

-- Task 3: Under the 'hive_bucket' database, create a table named transaction_bucket with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). Partition by Store_type and cluster by the cust_id column into 3 buckets.

hive > create table transaction_bucket(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE) partitioned by (Store_type STRING) clustered by (cust_id) into 3 buckets row format delimited fields terminated by ',';

-- Task 4: Fetch the records from the temporary table where Qty is greater than 0 and do a dynamic insert into the transaction_bucket table.

hive > INSERT OVERWRITE TABLE transaction_bucket PARTITION (Store_type) SELECT transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type FROM tmp1 where Qty>0;
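
-- Optional check: TABLESAMPLE can read one bucket at a time, confirming the rows
-- were hashed on cust_id into 3 buckets.
hive > select * from transaction_bucket tablesample(bucket 1 out of 3 on cust_id) limit 5;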


-- Task 5: Run the following commands in the hive shell (they enable ACID transactions, which the UPDATE in Task 8 requires):

hive > SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
hive > SET hive.support.concurrency=true;
hive > SET hive.exec.dynamic.partition.mode=nonstrict;
hive > SET hive.compactor.initiator.on=true;
hive > SET hive.compactor.worker.threads=1;

-- Task 6: Create a new table named bucket1 with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). Cluster by cust_id into 3 buckets, store the table as ORC, and set 'transactional' to 'true' in the table properties.

hive > create table bucket1(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) clustered by (cust_id) into 3 buckets row format delimited fields terminated by ',' stored as ORC tblproperties('transactional'='true');



-- Task 7: Insert the below values into the bucket1 table:

hive > insert into bucket1 values ('80712190438',270351,'28-02-2014',-5,-772,405.3,-4265.3,'e-Shop'),
('29258453508',270384,'27-02-2014',-5,-1497,785.925,-8270.925,'e-Shop'),
('93274880719',271509,'24-02-2014',-3,-1363,429.345,-4518.345,'e-Shop'),
('97439039119',272357,'23-02-2014',-2,-824,173.04,-1821.04,'TeleShop'),
('45649838090',273667,'22-02-2014',-1,-1450,152.25,-1602.25,'e-Shop'),
('22643667930',271489,'22-02-2014',-1,-1225,128.625,-1353.625,'TeleShop'),
('79792372943',275108,'22-02-2014',-3,-908,286.02,-3010.02,'MBR'),
('50076728598',269014,'21-02-2014',-4,-581,244.02,-2568.02,'e-Shop');

-- Task 8: Set 'tran_date' to '25-02-2014' for the 'e-Shop' store type.

hive > update bucket1 set tran_date='25-02-2014' where Store_type='e-Shop';
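
-- Optional check: every 'e-Shop' row should now have tran_date '25-02-2014'.
hive > select distinct tran_date from bucket1 where Store_type='e-Shop';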
