Lab 1: Welcome to Hive Hands-on - Creating Hive tables.
Solution 1: Creating Hive tables.
-- Open Hive in a Terminal.
user1@kq9ca59d26w:/projects/challenge$ hive
-- Task 1: Create a database named 'hive_tables'.
hive > create database hive_tables;
hive > use hive_tables;
-- Task 2: Under the hive_tables database, create a managed table named bank_internal. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.
hive > create table bank_internal(age INT, job STRING, marital STRING, balance INT, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');
-- Task 3: Load the CSV data into bank_internal table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank_internal;
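-- Optional sanity check (not part of the graded tasks): preview a few rows to confirm the load worked and the header line was skipped.
hive > select * from bank_internal limit 5;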
-- Task 4: Create a directory named '/bank_external' in HDFS. (Hint: open a new Terminal and run this command.)
user1@kq9ca59d26w:/projects/challenge$ hdfs dfs -mkdir -p /bank_external
-- Task 5: Create an external table named bank_external and set the location as '/bank_external'. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.
hive > create external table bank_external(age INT, job STRING, marital STRING, balance INT, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' LOCATION '/bank_external' tblproperties('skip.header.line.count'='1');
-- Task 6: Copy the bank.csv file to the '/bank_external' HDFS directory by loading it into the external table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank_external;
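-- Optional check: since bank_external is an external table, the load places the file under the '/bank_external' HDFS directory; listing it from a second Terminal confirms this (file names may vary).
user1@kq9ca59d26w:/projects/challenge$ hdfs dfs -ls /bank_external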
Lab 2: Welcome to Hive Hands-on - Insert the data.
Solution 2: Insert the data.
-- Open a new Terminal and type 'hive'.
user@x916p8e7134d:/projects/challenge$ hive
-- Task 1: Create a database named hive_insert.
hive > create database hive_insert;
hive > use hive_insert;
-- Task 2: Under the 'hive_insert' database, create a table named 'bank'. Get the column names for the table from the CSV file.
-- Make sure to skip the header while loading the data into the table.
hive > create table bank(age INT, job STRING, marital STRING, balance DOUBLE, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');
-- Task 3: Load the bank.csv file into bank table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank;
-- Task 4: Create a new table named 'bank1' with the following columns: "age", "job", "balance", "month", "duration".
hive > create table bank1(age INT, job STRING, balance DOUBLE, month STRING, duration INT);
-- Task 5: From the bank table, fetch the records that have age greater than 30 and duration greater than 300, and load the data into the bank1 table.
hive > insert into bank1(age, job, balance, month, duration) select age, job, balance, month, duration from bank where age>30 and duration>300;
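-- Optional check (not part of the graded tasks): count the rows copied over; the exact number depends on the bank.csv contents.
hive > select count(*) from bank1;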
-- Task 6: Insert the below records into the bank1 table:
-- (32,'technician',900,'mar',400),
-- (33,'admin',900,'may',500),
-- (35,'services',400,'jul',400),
-- (36,'blue-collar',600,'aug',600),
-- (38,'technician',800,'sep',700)
hive > insert into bank1 values (32,'technician',900,'mar',400), (33,'admin',900,'may',500), (35,'services',400,'jul',400), (36,'blue-collar',600,'aug',600), (38,'technician',800,'sep',700);
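-- Optional check: confirm the hand-entered rows arrived, e.g. by filtering on the last record's values (rows loaded from bank.csv may also match).
hive > select * from bank1 where job='technician' and month='sep';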
Lab 3: Welcome to Hive Hands-on - Query the data.
Solution 3: Query the data.
-- Open a new Terminal and type 'hive'.
user@z9kv8w2762x:/projects/challenge$ hive
-- Task 1: Create a database named hive_query.
hive > create database hive_query;
hive > use hive_query;
-- Task 2: Under the hive_query database, create a table named 'bank'. Get the column names for the table from the CSV file. Make sure to skip the header while loading the data into the table.
hive > create table bank(age INT, job STRING, marital STRING, balance DOUBLE, loan STRING, contact STRING, month STRING, duration INT) row format delimited fields terminated by ',' lines terminated by '\n' tblproperties('skip.header.line.count'='1');
-- Task 3: Load the bank.csv file into bank table.
hive > load data local inpath '/projects/challenge/bank.csv' into table bank;
-- Task 4: Open a new Terminal and create an HDFS directory '/output1'.
user@z9kv8w2762x:/projects/challenge$ hadoop fs -mkdir -p /output1
-- Task 5: Fetch the records which have job = 'technician', age less than 60, and month in ('jan', 'oct', 'jul'). Save the output in the HDFS directory '/output1'.
hive > insert overwrite directory '/output1' row format delimited fields terminated by ',' select * from bank where job='technician' and age<60 and month in ('jan','oct','jul');
-- Task 6: Open a new Terminal and create an HDFS directory '/output2'.
user@z9kv8w2762x:/projects/challenge$ hadoop fs -mkdir -p /output2
-- Task 7: Get the count of the records based on the job column. Save the output in the HDFS directory '/output2'.
hive > insert overwrite directory '/output2' row format delimited fields terminated by ',' select job, count(*) from bank group by job;
-- Task 8: Open a new Terminal and create an HDFS directory '/output3'.
user@z9kv8w2762x:/projects/challenge$ hadoop fs -mkdir -p /output3
-- Task 9: Get the maximum duration based on the job column and save the output in the HDFS directory '/output3'.
hive > insert overwrite directory '/output3' row format delimited fields terminated by ',' select max(duration), job from bank group by job;
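-- Optional check (not part of the graded tasks): each INSERT OVERWRITE DIRECTORY above writes part files into its target directory; cat-ing them from a Terminal shows the comma-delimited results (part-file names may vary, so a wildcard is used).
user@z9kv8w2762x:/projects/challenge$ hadoop fs -cat /output1/*
user@z9kv8w2762x:/projects/challenge$ hadoop fs -cat /output2/*
user@z9kv8w2762x:/projects/challenge$ hadoop fs -cat /output3/*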
Lab 4: Welcome to Hive Hands-on - Partition table.
Solution 4: Partition table.
-- Open a new Terminal and type "hive".
user@t4n19po6f891k:/projects/challenge$ hive
-- Task 1: Create a database named 'hive_part'.
hive > create database hive_part;
hive > use hive_part;
-- Task 2: Create a temporary table and load the Transactions.csv records into the table. (Use the string datatype for the tran_date column.)
hive > CREATE TEMPORARY TABLE tmp1(transaction_id STRING, cust_id INT, tran_date STRING, prod_subcat_code INT, prod_cat_code INT, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) row format delimited fields terminated by ',' lines terminated by '\n';
hive > load data local inpath '/projects/challenge/Transactions.csv' into table tmp1;
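-- The next two settings enable dynamic partitioning; 'nonstrict' mode lets Hive derive every partition value (here, Store_type) from the SELECT instead of requiring at least one static partition.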
hive > set hive.exec.dynamic.partition=true;
hive > set hive.exec.dynamic.partition.mode=nonstrict;
-- Task 3: Under the hive_part database, create a table named transaction_part with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). The partition column should be Store_type.
hive > create table transaction_part(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE) partitioned by (Store_type STRING) row format delimited fields terminated by ',';
-- Task 4: Fetch the records from the temporary table where the 'Qty' is greater than 2 and do a dynamic insert into the transaction_part table.
hive > INSERT OVERWRITE TABLE transaction_part PARTITION (Store_type) SELECT transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type FROM tmp1 WHERE Qty>2;
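-- Optional check (not part of the graded tasks): list the partitions that the dynamic insert created, one per distinct Store_type value in the filtered data.
hive > show partitions transaction_part;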
-- Task 5: Open a new Terminal and create a new hdfs directory "/store".
user@t4n19po6f891k:/projects/challenge$ hdfs dfs -mkdir -p /store
-- Task 6: Fetch the records from the transaction_part table where the store type is "e-Shop" and store the result in the HDFS directory '/store'. The fields should be terminated by ','.
hive > insert overwrite directory '/store' row format delimited fields terminated by ',' select * from transaction_part where Store_type = 'e-Shop';
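-- Optional check: view the exported comma-delimited rows from a Terminal (part-file names may vary, so a wildcard is used).
user@t4n19po6f891k:/projects/challenge$ hdfs dfs -cat /store/*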
Lab 5: Welcome to Hive Hands-on - Bucketing table.
Solution 5: Bucketing table.
-- Open a Terminal and type "hive".
user1@kq9ca59d26w:/projects/challenge$ hive
-- Task 1: Create a database named 'hive_bucket'.
hive > create database hive_bucket;
hive > use hive_bucket;
-- Task 2: Create a temporary table and load the Transactions.csv records into the table. (Use the string datatype for the tran_date column.)
hive > CREATE TEMPORARY TABLE tmp1(transaction_id STRING, cust_id INT, tran_date STRING, prod_subcat_code INT, prod_cat_code INT, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) row format delimited fields terminated by ',' lines terminated by '\n';
hive > load data local inpath '/projects/challenge/Transactions.csv' into table tmp1;
hive > set hive.exec.dynamic.partition=true;
hive > set hive.exec.dynamic.partition.mode=nonstrict;
-- Task 3: Under the 'hive_bucket' database, create a table named transaction_bucket with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). Partition by Store_type and cluster by the cust_id column into 3 buckets.
hive > create table transaction_bucket(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE) partitioned by (Store_type STRING) clustered by (cust_id) into 3 buckets row format delimited fields terminated by ',';
-- Task 4: Fetch the records from the temporary table where the 'Qty' is greater than 0 and do a dynamic insert into the transaction_bucket table.
hive > INSERT OVERWRITE TABLE transaction_bucket PARTITION (Store_type) SELECT transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type FROM tmp1 WHERE Qty>0;
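-- Optional check (not part of the graded tasks): DESCRIBE FORMATTED prints the bucketing metadata; look for 'Num Buckets: 3' and 'Bucket Columns: [cust_id]' in the output.
hive > describe formatted transaction_bucket;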
-- Task 5: Run the following commands in the hive shell:
hive > SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
hive > SET hive.support.concurrency=true;
hive > SET hive.exec.dynamic.partition.mode=nonstrict;
hive > SET hive.compactor.initiator.on=true;
hive > SET hive.compactor.worker.threads=1;
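-- The settings above switch the session to the ACID transaction manager and turn on the compactor; Hive requires them before UPDATE statements (such as Task 8 below) can run against a transactional table.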
-- Task 6: Create a new table named bucket1 with the following columns:
-- (transaction_id, cust_id, tran_date, Qty, Rate, Tax, total_amt, Store_type). Cluster by cust_id into 3 buckets, store the table as ORC, and set 'transactional' to true in the table properties.
hive > create table bucket1(transaction_id STRING, cust_id INT, tran_date STRING, Qty INT, Rate INT, Tax DOUBLE, total_amt DOUBLE, Store_type STRING) clustered by (cust_id) into 3 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true');
-- Task 7: Insert the below values into the bucket1 table:
hive > insert into bucket1 values ('80712190438',270351,'28-02-2014',-5,-772,405.3,-4265.3,'e-Shop'),
('29258453508',270384,'27-02-2014',-5,-1497,785.925,-8270.925,'e-Shop'),
('93274880719',271509,'24-02-2014',-3,-1363,429.345,-4518.345,'e-Shop'),
('97439039119',272357,'23-02-2014',-2,-824,173.04,-1821.04,'TeleShop'),
('45649838090',273667,'22-02-2014',-1,-1450,152.25,-1602.25,'e-Shop'),
('22643667930',271489,'22-02-2014',-1,-1225,128.625,-1353.625,'TeleShop'),
('79792372943',275108,'22-02-2014',-3,-908,286.02,-3010.02,'MBR'),
('50076728598',269014,'21-02-2014',-4,-581,244.02,-2568.02,'e-Shop');
-- Task 8: Set the 'tran_date' as '25-02-2014' for the 'e-Shop' store type.
hive > update bucket1 set tran_date='25-02-2014' where Store_type='e-Shop';
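-- Optional check: every 'e-Shop' row should now carry the updated date.
hive > select transaction_id, tran_date, Store_type from bucket1 where Store_type='e-Shop';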