/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asp.tranlog;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import com.asp.tranlog.ImportTsv.TsvParser.BadTsvLineException;
/**
 * Mapper that parses each TSV input line into an HBase {@link Put}, keyed by a
 * row key composed from the configured key columns, for bulk import.
 */
public class TsvImporterMapper extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    private static final Log LOG = LogFactory.getLog(TsvImporterMapper.class);

    /** Timestamp applied to every inserted cell. */
    private long ts;

    /** Column separator for the input TSV (Base64-decoded when configured). */
    private String separator;

    /** Whether malformed lines are counted and skipped instead of failing the task. */
    private boolean skipBadLines;

    /** Job counter "ImportTsv:Bad Lines". */
    private Counter badLineCount;

    /**
     * Indexes of the input columns that are concatenated into the row key,
     * e.g. aaaa+bbb+ccc.
     */
    private int[] keyColIndex = null;

    /** Fixed widths for the row-key columns; 0 means no fixed width. */
    private int[] keyColLen = null;

    /** Per-column type codes (see the ImportTsv.COL_TYPE_* constants). */
    private byte[] columnTypes = null;

    /**
     * For columns of type 'datetime': index into {@link #datetimeParsers} of
     * the format that last parsed that column successfully, tried first on the
     * column's next value.
     */
    private char[] colDatetimeFormater = null;

    /** Charset used to decode string-typed row-key columns. */
    private String charset;

    /** Separator placed between row-key components ("" = none). */
    private String hbase_rowkey_separator;

    /**
     * Candidate datetime formats, probed in order.
     * NOTE(review): SimpleDateFormat is not thread-safe and this array is
     * statically shared; this is only safe while one thread per JVM parses
     * dates (the normal one-mapper-per-task case) — confirm before using
     * with MultithreadedMapper.
     */
    public final static SimpleDateFormat[] datetimeParsers = {
            new SimpleDateFormat("MMM dd yyyy hh:mm:ss:SSSaa",
                    new java.util.Locale("en")), // Dec 7 2012 3:35:30:453PM
            new SimpleDateFormat("yyyyMMdd", new java.util.Locale("en")) };

    private ImportTsv.TsvParser parser;

    /** @return the timestamp used for all inserted cells */
    public long getTs() {
        return ts;
    }

    /** @return whether bad input lines are skipped instead of failing the task */
    public boolean getSkipBadLines() {
        return skipBadLines;
    }

    /** @return the "Bad Lines" job counter */
    public Counter getBadLineCount() {
        return badLineCount;
    }

    /** Adds {@code count} to the "Bad Lines" job counter. */
    public void incrementBadLineCount(int count) {
        this.badLineCount.increment(count);
    }

    /**
     * Handles initializing this class with objects specific to it (i.e., the
     * parser). Common initialization that might be leveraged by a subclass is
     * done in <code>doSetup</code>. Hence a subclass may choose to override
     * this method and call <code>doSetup</code> as well before handling its
     * own custom params.
     *
     * @param context the Hadoop task context supplying the job configuration
     * @throws RuntimeException when no row key column is configured
     */
    @Override
    protected void setup(Context context) {
        doSetup(context);

        Configuration conf = context.getConfiguration();
        charset = conf.get(ImportTsv.CHARSET_CONF_KEY);
        parser = new ImportTsv.TsvParser(conf.get(ImportTsv.COLUMNS_CONF_KEY),
                conf.getStrings(ImportTsv.KEYCOLUMNS_CONF_KEY), separator);
        keyColIndex = parser.getRowKeyColumnIndex();
        keyColLen = parser.getRowKeyColumnLen();
        if (keyColIndex == null) {
            throw new RuntimeException("No row key column specified");
        }
        columnTypes = parser.getColType();
        if (columnTypes != null) {
            // A new char[] is zero-filled per the JLS, which makes
            // datetimeParsers[0] the first format tried for every column;
            // no explicit fill loop is needed.
            colDatetimeFormater = new char[columnTypes.length];
        }
    }

    /**
     * Handles common parameter initialization that a subclass might want to
     * leverage.
     *
     * @param context the Hadoop task context supplying the job configuration
     */
    protected void doSetup(Context context) {
        Configuration conf = context.getConfiguration();

        // If a custom separator has been used, decode it back from Base64.
        separator = conf.get(ImportTsv.SEPARATOR_CONF_KEY);
        if (separator == null) {
            separator = ImportTsv.DEFAULT_SEPARATOR;
        } else {
            separator = new String(Base64.decode(separator));
        }

        hbase_rowkey_separator = conf.get(ImportTsv.SEPARATOR_CONF_ROWKEY);
        if (hbase_rowkey_separator == null
                || hbase_rowkey_separator.trim().length() == 0) {
            hbase_rowkey_separator = "";
        } else {
            hbase_rowkey_separator = new String(
                    Base64.decode(hbase_rowkey_separator));
        }

        ts = conf.getLong(ImportTsv.TIMESTAMP_CONF_KEY,
                System.currentTimeMillis());
        skipBadLines = conf.getBoolean(ImportTsv.SKIP_LINES_CONF_KEY, true);
        badLineCount = context.getCounter("ImportTsv", "Bad Lines");
    }

    /**
     * Parses a datetime column value, remembering per column which entry of
     * {@link #datetimeParsers} succeeded so that format is tried first next
     * time.
     *
     * @param byteVal raw column bytes
     * @param colIdx  index of the column being parsed
     * @return the parsed date, never null
     * @throws ParseException if no configured format matches the value
     */
    protected Date parseTimestamp(byte[] byteVal, int colIdx)
            throws ParseException {
        Date rtnDate = null;
        String dateString = Bytes.toString(byteVal);
        if (colDatetimeFormater != null && colDatetimeFormater.length > colIdx) {
            // First try the format that worked for this column last time.
            int fmtrIdx = colDatetimeFormater[colIdx];
            try {
                rtnDate = datetimeParsers[fmtrIdx].parse(dateString);
            } catch (java.text.ParseException ignored) {
                // Expected miss; fall through and probe the full list below.
            }
            if (rtnDate == null) {
                for (int i = 0; i < datetimeParsers.length; i++) {
                    try {
                        rtnDate = datetimeParsers[i].parse(dateString);
                    } catch (java.text.ParseException ignored) {
                        // Expected while probing; try the next format.
                    }
                    if (rtnDate != null) {
                        // Cache the winning format for this column.
                        colDatetimeFormater[colIdx] = (char) i;
                        break;
                    }
                }
            }
        }
        if (rtnDate == null) {
            LOG.error("No supported date format found: " + dateString);
            throw new ParseException("Failed to parse date: " + dateString, 0);
        }
        return rtnDate;
    }

    /**
     * Extracts the raw bytes of the column at {@code colIdx} from the line.
     *
     * @param lineBytes backing bytes of the input line
     * @param parsed    offsets/lengths of the parsed columns
     * @param colIdx    column to extract
     * @return a fresh copy of the column bytes, or null when {@code colIdx}
     *         is beyond the declared columns
     */
    protected byte[] getInputColBytes(byte[] lineBytes,
            ImportTsv.TsvParser.ParsedLine parsed, int colIdx) {
        if (colIdx >= columnTypes.length) {
            return null;
        }
        int colOffset = parsed.getColumnOffset(colIdx);
        int colLen = parsed.getColumnLength(colIdx);
        byte[] colBytes = new byte[colLen];
        Bytes.putBytes(colBytes, 0, lineBytes, colOffset, colLen);
        return colBytes;
    }

    /**
     * Right-pads {@code val} with spaces to {@code len}, or truncates it when
     * longer; {@code len == 0} returns the value unchanged (rightPad to 0 is
     * a no-op).
     */
    private static String fitToWidth(String val, int len) {
        if (len != 0 && val.length() > len) {
            return val.substring(0, len);
        }
        return StringUtils.rightPad(val, len);
    }

    /**
     * Builds the row-key byte array by concatenating the configured key
     * columns, each rendered as a string: datetime columns become their
     * epoch-millis value, int/long columns are zero-padded to their width,
     * doubles use toString, and string columns are space-padded or truncated
     * to their width. Whitespace inside values is preserved, so columns
     * "kv1 ", "kv2", " kv3" yield the key "kv1 +kv2+ kv3".
     *
     * @param lineBytes backing bytes of the input line
     * @param parsed    offsets/lengths of the parsed columns
     * @return the composed row key bytes
     * @throws BadTsvLineException when a key column is missing or fails to
     *         parse as its declared type
     */
    protected byte[] createRowkeyByteArray(byte[] lineBytes,
            ImportTsv.TsvParser.ParsedLine parsed) throws BadTsvLineException {
        try {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < keyColIndex.length; i++) {
                if (i > 0 && hbase_rowkey_separator.length() > 0) {
                    sb.append(hbase_rowkey_separator);
                }
                byte[] colBytes = getInputColBytes(lineBytes, parsed,
                        keyColIndex[i]);
                if (colBytes == null) {
                    throw new BadTsvLineException(
                            "Failed to get column bytes for " + keyColIndex[i]);
                }
                byte colType = columnTypes[keyColIndex[i]];
                if (colType == ImportTsv.COL_TYPE_DATETIME) {
                    Date tmpDate = parseTimestamp(colBytes, keyColIndex[i]);
                    sb.append(Long.toString(tmpDate.getTime()));
                } else if (colType == ImportTsv.COL_TYPE_STRING) {
                    if (StringUtils.isEmpty(charset)) {
                        charset = HConstants.UTF8_ENCODING;
                    }
                    // Re-encode the column from the configured charset to
                    // UTF-8 before rendering it into the key.
                    // NOTE(review): Text.getBytes() can return a backing
                    // array longer than the encoded length; assumed exact for
                    // a freshly built Text — confirm.
                    String lineStr = new String(colBytes, charset);
                    String rowCol = Bytes.toString(new Text(lineStr).getBytes());
                    sb.append(fitToWidth(rowCol, keyColLen[i]));
                } else if (colType == ImportTsv.COL_TYPE_INT) {
                    int intVal = Integer.parseInt(Bytes.toString(colBytes));
                    sb.append(StringUtils.leftPad(Integer.toString(intVal),
                            keyColLen[i], '0'));
                } else if (colType == ImportTsv.COL_TYPE_DOUBLE) {
                    double dbval = Double.parseDouble(Bytes.toString(colBytes));
                    sb.append(Double.toString(dbval));
                } else if (colType == ImportTsv.COL_TYPE_LONG) {
                    long longVal = Long.parseLong(Bytes.toString(colBytes));
                    sb.append(StringUtils.leftPad(Long.toString(longVal),
                            keyColLen[i], '0'));
                } else {
                    sb.append(fitToWidth(Bytes.toString(colBytes),
                            keyColLen[i]));
                }
            }
            // NOTE(review): getBytes() uses the platform default charset;
            // kept as-is for compatibility with row keys already written.
            return sb.toString().getBytes();
        } catch (Exception e) {
            // BadTsvLineException only exposes a String ctor here, so the
            // cause cannot be chained; toString() keeps the exception type
            // and never returns null (unlike getMessage()).
            throw new BadTsvLineException(e.toString());
        }
    }

    /**
     * Converts the column at {@code colIdx} to its binary HBase value:
     * datetime becomes 8-byte epoch millis, int 4 bytes, long/double 8 bytes;
     * any other type passes through unchanged.
     *
     * @param lineBytes backing bytes of the input line
     * @param parsed    offsets/lengths of the parsed columns
     * @param colIdx    column to convert
     * @return the converted bytes, or null when the column is out of range
     * @throws BadTsvLineException when the value fails to parse as its type
     */
    protected byte[] convertColBytes(byte[] lineBytes,
            ImportTsv.TsvParser.ParsedLine parsed, int colIdx)
            throws BadTsvLineException {
        byte[] srcBytes = getInputColBytes(lineBytes, parsed, colIdx);
        try {
            if (columnTypes[colIdx] == ImportTsv.COL_TYPE_DATETIME) {
                Date tmpDate = parseTimestamp(srcBytes, colIdx);
                return Bytes.toBytes(tmpDate.getTime());
            } else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_INT) {
                return Bytes.toBytes(Integer.parseInt(Bytes.toString(srcBytes)));
            } else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_DOUBLE) {
                return Bytes.toBytes(Double.parseDouble(Bytes.toString(srcBytes)));
            } else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_LONG) {
                return Bytes.toBytes(Long.parseLong(Bytes.toString(srcBytes)));
            } else {
                return srcBytes;
            }
        } catch (Exception e) {
            throw new BadTsvLineException(e.toString());
        }
    }

    /**
     * Converts a line of TSV text into an HBase table row ({@link Put}).
     * Malformed lines are skipped and counted when {@link #skipBadLines} is
     * set, otherwise they fail the task with an IOException.
     */
    @Override
    public void map(LongWritable offset, Text value, Context context)
            throws IOException {
        byte[] lineBytes = value.getBytes();
        int i = 0;
        try {
            ImportTsv.TsvParser.ParsedLine parsed = parser.parse(lineBytes,
                    value.getLength());
            byte[] rowKeyBytes = createRowkeyByteArray(lineBytes, parsed);
            ImmutableBytesWritable rowKey = new ImmutableBytesWritable(
                    rowKeyBytes);
            Put put = new Put(rowKeyBytes);
            // Bulk-import speed: skip the WAL; a failed task is simply re-run.
            put.setWriteToWAL(false);
            for (i = 0; i < parsed.getColumnCount(); i++) {
                KeyValue kv;
                if (columnTypes[i] == ImportTsv.COL_TYPE_STRING) {
                    // String columns are copied straight out of the input
                    // line without conversion.
                    kv = new KeyValue(rowKeyBytes, parser.getFamily(i),
                            parser.getQualifier(i), 0,
                            parser.getQualifier(i).length, ts,
                            KeyValue.Type.Put, lineBytes,
                            parsed.getColumnOffset(i),
                            parsed.getColumnLength(i));
                } else {
                    byte[] colBytes = convertColBytes(lineBytes, parsed, i);
                    if (colBytes == null) {
                        throw new ImportTsv.TsvParser.BadTsvLineException(
                                "Failed to get bytes for column " + i);
                    }
                    kv = new KeyValue(rowKeyBytes, parser.getFamily(i),
                            parser.getQualifier(i), ts, colBytes);
                }
                put.add(kv);
            }
            context.write(rowKey, put);
        } catch (ImportTsv.TsvParser.BadTsvLineException badLine) {
            handleBadLine(lineBytes, value.getLength(), i, badLine);
        } catch (IllegalArgumentException e) {
            handleBadLine(lineBytes, value.getLength(), i, e);
        } catch (InterruptedException e) {
            // Never swallow interruption: restore the flag and surface the
            // failure so the framework can shut the task down.
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while writing record", e);
        }
    }

    /**
     * Common bad-line handling for {@link #map}: when {@link #skipBadLines}
     * is set the offending line is logged (decoded as gb18030, the encoding
     * this job reads) and counted; otherwise the cause is rethrown as an
     * IOException.
     *
     * @param lineBytes backing bytes of the input line
     * @param lineLen   number of valid bytes — Text.getBytes() may return a
     *                  backing array longer than the line itself
     * @param colIdx    index of the column being processed when the error hit
     * @param cause     the parse/convert failure
     */
    private void handleBadLine(byte[] lineBytes, int lineLen, int colIdx,
            Exception cause) throws IOException {
        if (!skipBadLines) {
            throw new IOException(cause);
        }
        String line = new String(lineBytes, 0, lineLen, "gb18030");
        System.err.println("Bad line: " + line + ":" + colIdx + "\n");
        LOG.error("Bad line: " + line + "," + colIdx, cause);
        incrementBadLineCount(1);
    }
}