The script below bulk-loads data into HBase using the importtsv method.
Script considerations :-
1) INPUT_file.tsv separated by tab
2) Proper environment variables set
Environment setup:
[ nitin@nitin-R15: ~ ]$ vim .bashrc
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HBASE_HOME/lib/guava-<version>.jar
[ nitin@nitin-R15: ~ ]$ source .bashrc
#!/bin/bash
# Bulk-load a tab-separated file into HBase via the importtsv MapReduce job.
# Prerequisite: HADOOP_CLASSPATH must include the HBase guava jar (see notes above).

# Path to the tab-separated input file; the first row must be the column header.
INPUT_FILE="<INPUT_file.tsv>"

# Fail fast BEFORE touching the file (the original script only checked
# after it had already read and modified the file).
if [ ! -f "${INPUT_FILE}" ]
then
echo " ERROR :: INPUT FILE is MISSING "
exit 2
fi

COLUMN_FAMILY_NAME="cf1"
# Multi-word command prefix; deliberately expanded unquoted at call sites.
HADOOP_BIN="sudo -E -u hdfs hadoop "
TABLE_NAME="NT_AMAZON"
INPUT_DIR_PATH="/user/input/"
JAR_NAME="/usr/lib/hbase/hbase.jar"

# Build the importtsv column spec from the header row:
#   "colA<TAB>colB"  ->  "cf1:colA,cf1:colB"
# One sed invocation replaces the original three in-place passes.
head -n 1 "${INPUT_FILE}" \
  | sed -e 's/\t/,/g' \
        -e "s/,/,${COLUMN_FAMILY_NAME}:/g" \
        -e "s/^/${COLUMN_FAMILY_NAME}:/" > schema.txt

# File name as it will appear under INPUT_DIR_PATH on HDFS.
HADOOP_LOC_FILE=$(basename "${INPUT_FILE}")

# Strip the header row in place so it is not imported as data.
# NOTE: the original captured this in DATA_COMPUTATION_FILE, but `sed -i`
# writes in place and prints nothing, so that variable was always empty
# (and never used); the assignment has been dropped.
sed -i -e '1d' "${INPUT_FILE}"

CF_FINAL_NAME=$(cat schema.txt)
#######################################
# Create the target HBase table with a single column family.
# Arguments:
#   $1 - table name       (default: NT_AMAZON, matching the original)
#   $2 - column family    (default: cf1, matching the original)
# Returns:
#   Exit status of the hbase shell.
#   NOTE(review): `hbase shell` fed via a heredoc may exit 0 even when
#   'create' fails (e.g. TableExistsException) — the caller's status
#   check is therefore best-effort; confirm against your HBase version.
#######################################
create_table ()
{
local table="${1:-NT_AMAZON}"
local cf="${2:-cf1}"
hbase shell<<_EOF_
create '${table}','${cf}'
_EOF_
}
# Provision the HBase table; abort the whole load if creation fails.
if ! create_table
then
echo "ERROR :: Unable to create table ${TABLE_NAME} on HBASE"
exit 2
fi
# Copy the data file onto HDFS.
# NOTE: the original ran `sed -i -e 's/\t//g' ${INPUT_FILE}` here, which
# deleted EVERY tab from the file and thus destroyed the field separators
# that importtsv needs — that line has been removed.
# HADOOP_BIN is intentionally unquoted: it is a multi-word command prefix.
${HADOOP_BIN} fs -put "${INPUT_FILE}" "${INPUT_DIR_PATH}"
if [ $? -ne 0 ];
then
echo "ERROR :: Unable to copy file ${INPUT_FILE} on Hadoop"
exit 2
fi

# Run the importtsv job against the uploaded file.
# The separator is an explicit tab: the original passed
# '-Dimporttsv.separator=' (empty), which importtsv rejects.
${HADOOP_BIN} jar "${JAR_NAME}" importtsv \
  "-Dimporttsv.columns=HBASE_ROW_KEY,${CF_FINAL_NAME}" \
  -Dimporttsv.separator=$'\t' \
  "${TABLE_NAME}" "${INPUT_DIR_PATH}${HADOOP_LOC_FILE}"
if [ $? -ne 0 ];
then
echo "ERROR :: Unable to insert data into Hbase"
exit 2
fi
No comments:
Post a Comment