Wednesday, 14 August 2013

BULK INSERT INTO HBASE


The script below bulk-loads data into HBase using the importtsv method. 
Script considerations: 

1) INPUT_file.tsv separated by tab 

2) Proper environment variables set 

Environment setup 

[  nitin@nitin-R15: ~ ]$ vim  .bashrc
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HBASE_HOME/lib/guava-<version>.jar
[  nitin@nitin-R15: ~ ]$ source .bashrc 


#!/bin/bash
# Bulk-load a tab-separated file into HBase via the importtsv MapReduce tool.
# Prerequisites: HADOOP_CLASSPATH includes HBase's guava jar (see notes above)
# and the invoking user may sudo to 'hdfs'.

# Path to the tab-separated input file; its first line must be the column header.
INPUT_FILE="<INPUT_file.tsv>"

# Fail fast BEFORE touching the file. (The original checked existence only
# after already running head/sed against the file, producing confusing errors.)
if [ ! -f "${INPUT_FILE}" ]
then
        echo " ERROR :: INPUT FILE is MISSING "
        exit 2
fi

# Build the importtsv column list from the header row:
#   "colA<TAB>colB"  ->  "cf1:colA,cf1:colB"
head -n 1 "${INPUT_FILE}" > schema.txt
sed -i -e 's/\t/,/g' schema.txt        # tabs -> commas
sed -i -e 's/,/,cf1:/g' schema.txt     # prefix every subsequent column with cf1:
sed -i -e 's/^/cf1:/g' schema.txt      # prefix the first column with cf1:

HADOOP_LOC_FILE=$(basename "${INPUT_FILE}")

# Strip the header row so it is not imported as data.
# NOTE: this mutates INPUT_FILE in place (same behavior as the original, which
# hid this inside a useless variable capture — 'sed -i' prints nothing, so
# DATA_COMPUTATION_FILE was always empty and never used).
sed -i -e '1d' "${INPUT_FILE}"

COLUMN_FAMILY_NAME="cf1"
HADOOP_BIN="sudo -E -u hdfs hadoop "
TABLE_NAME="NT_AMAZON"
INPUT_DIR_PATH="/user/input/"
JAR_NAME="/usr/lib/hbase/hbase.jar"
CF_FINAL_NAME=$(cat schema.txt)

#######################################
# Create the target HBase table with the configured column family.
# Globals:   TABLE_NAME (read), COLUMN_FAMILY_NAME (read)
# Returns:   exit status of the 'hbase shell' invocation.
# Fix: the original hardcoded 'NT_AMAZON','cf1' here, silently ignoring the
# TABLE_NAME / COLUMN_FAMILY_NAME variables defined above.
# NOTE(review): 'hbase shell' may exit 0 even when the create fails
# (e.g. table already exists) — verify on the target HBase version.
#######################################
create_table ()
{
hbase shell <<_EOF_
create '${TABLE_NAME}','${COLUMN_FAMILY_NAME}'
_EOF_
}

# Create the table; abort if that fails.
create_table
if [ $? -ne 0 ];
then
        echo "ERROR :: Unable to create table ${TABLE_NAME} on HBASE"
        exit 2
fi

# BUG FIX: the original ran  sed -i -e 's/\t//g'  here, deleting every tab
# from the TSV, and then imported with an EMPTY separator
# ('-Dimporttsv.separator='), which destroys the column structure of a
# tab-separated file. importtsv's DEFAULT separator is a tab, so the file is
# now uploaded unmodified and imported with tab separation.

# Copy the data file into HDFS; abort on failure.
# (HADOOP_BIN is deliberately unquoted: it holds a multi-word command.)
${HADOOP_BIN} fs -put "${INPUT_FILE}" "${INPUT_DIR_PATH}"
if [ $? -ne 0 ];
then
        echo "ERROR :: Unable to copy file ${INPUT_FILE} on Hadoop"
        exit 2
fi

# Run the importtsv MapReduce job: the first TSV column becomes the row key,
# the rest map to the cf1:<name> columns listed in CF_FINAL_NAME.
${HADOOP_BIN} jar "${JAR_NAME}" importtsv \
        -Dimporttsv.columns=HBASE_ROW_KEY,${CF_FINAL_NAME} \
        "${TABLE_NAME}" "${INPUT_DIR_PATH}${HADOOP_LOC_FILE}"
if [ $? -ne 0 ];
then
        echo "ERROR :: Unable to insert data into Hbase"
        exit 2
fi

No comments:

Post a Comment

Ansible Cheat sheet

Install Ansible  # yum install ansible Host file configuration  File  [ansible@kuber2 ~]$ cat /etc/ansible/hosts     [loca...