TsvImporterMapper.java example

Explorer

learning-hadoop-master
- cdh-hbase-examples
  - src
    - com
      - embracesource
        edh
        hbase
        AggregateTest.java
        BlobStoreTest.java
        ExpressionFilterTest.java
        GroupByTest.java
        MultiRowRangeFilterTest.java
        ParallelScannerTest.java
        ReplicationTest.java
        TestGroupby.java
        table
        create
        Configure.java
        OperateTableUtil.java
        TableBuilder.java
        TestCreateTable.java
- hbase-importTsv
  - src
    - com
      - asp
        tranlog
        CreateHbaseTable.java
        CreateNewHbase.java
        Global.java
        ImportTsv.java
        TsvImporterMapper.java
- jmeter-hbase-plugins
  - src
    - main
      - java
        com
        embracesource
        hbase
        jmeter
        HTableUtil.java
        HbaseJMeter.java
        HbaseJMeterPut.java
        JMeterHTablePool.java
        SequenceKey.java
        Strings.java
    - test
      - java
        com
        embracesource
        hbase
        jmeter
        JMeterTest.java
- kettle-cassandra-plugin
  - src
    - org
      - pentaho
        cassandra
        CassandraColumnMetaData.java
        CassandraConnection.java
        di
        trans
        steps
        cassandrainput
        CassandraInput.java
        CassandraInputData.java
        CassandraInputDialog.java
        CassandraInputMeta.java
        cassandraoutput
        CassandraOutput.java
        CassandraOutputData.java
        CassandraOutputDialog.java
        CassandraOutputMeta.java
        EnterCQLDialog.java
        cassandrasstableoutput
        SSTableOutput.java
        SSTableOutputData.java
        SSTableOutputDialog.java
        SSTableOutputMeta.java
        SSTableWriter.java
- spark-on-hbase
  - src
    - main
      - java
        com
        cloudera
        sa
        spark
        hbase
        example
        JavaHBaseBulkDeleteExample.java
        JavaHBaseBulkGetExample.java
        JavaHBaseBulkIncrementExample.java
        JavaHBaseBulkPutExample.java
        JavaHBaseDistributedScan.java
        JavaHBaseMapGetPutExample.java
        JavaHBaseStreamingBulkPutExample.java
        TestJavaLocalMainExample.java
    - test
      - java
        spark
        hbase
        JavaHBaseContextSuite.java
- zkpublisher
  - src
    - main
      - java
        com
        embracesource
        config
        ConfigChangeListener.java
        ConfigChangeSubscriber.java
        DynamicPropertiesHelper.java
        DynamicPropertiesHelperFactory.java
        DynamicPropertiesSpringConfigurer.java
        ZkConfigChangeSubscriberImpl.java
        ZkConfigPublisher.java
        ZkConfigSaver.java
        ZkUtils.java
    - test
      - java
        com
        embracesource
        config
        DynamicPropertiesHelperTest.java
        ReadPropertiesTest.java
        ZkConfigChangeSubscriberImplTest.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asp.tranlog;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

import com.asp.tranlog.ImportTsv.TsvParser.BadTsvLineException;

/**
 * Write table content out to files in hdfs.
 */
public class TsvImporterMapper extends
		Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
	private static final Log LOG = LogFactory.getLog(TsvImporterMapper.class);

	/** Timestamp for all inserted rows */
	private long ts;

	/** Column seperator */
	private String separator;

	/** Should skip bad lines */
	private boolean skipBadLines;
	private Counter badLineCount;
	private int[] keyColIndex = null;// The column index that will be used to
										// compose to a row key like
										// aaaa+bbb+ccc
	private int[] keyColLen = null;
	private byte[] columnTypes = null;
	private char[] colDatetimeFormater = null;// For columns with type
												// 'datetime', the formater will
												// be saved in this array.

	private String charset;
	private String hbase_rowkey_separator;

	public final static SimpleDateFormat[] datetimeParsers = {
			new SimpleDateFormat("MMM dd yyyy hh:mm:ss:SSSaa",
					new java.util.Locale("en")),// Dec 7 2012 3:35:30:453PM
			new SimpleDateFormat("yyyyMMdd", new java.util.Locale("en")) };

	private ImportTsv.TsvParser parser;

	public long getTs() {
		return ts;
	}

	public boolean getSkipBadLines() {
		return skipBadLines;
	}

	public Counter getBadLineCount() {
		return badLineCount;
	}

	public void incrementBadLineCount(int count) {
		this.badLineCount.increment(count);
	}

	/**
	 * Handles initializing this class with objects specific to it (i.e., the
	 * parser). Common initialization that might be leveraged by a subsclass is
	 * done in <code>doSetup</code>. Hence a subclass may choose to override
	 * this method and call <code>doSetup</code> as well before handling it's
	 * own custom params.
	 * 
	 * @param context
	 */
	@Override
	protected void setup(Context context) {
		doSetup(context);

		Configuration conf = context.getConfiguration();

		charset = conf.get(ImportTsv.CHARSET_CONF_KEY);

		parser = new ImportTsv.TsvParser(conf.get(ImportTsv.COLUMNS_CONF_KEY),
				conf.getStrings(ImportTsv.KEYCOLUMNS_CONF_KEY), separator);
		keyColIndex = parser.getRowKeyColumnIndex();
		keyColLen = parser.getRowKeyColumnLen();
		if (keyColIndex == null) {
			throw new RuntimeException("No row key column specified");
		}
		columnTypes = parser.getColType();
		if (columnTypes != null) {
			colDatetimeFormater = new char[columnTypes.length];
			for (int i = 0; i < columnTypes.length; i++)
				colDatetimeFormater[i] = 0;
		}
	}

	/**
	 * Handles common parameter initialization that a subclass might want to
	 * leverage.
	 * 
	 * @param context
	 */
	protected void doSetup(Context context) {
		Configuration conf = context.getConfiguration();

		// If a custom separator has been used,
		// decode it back from Base64 encoding.
		separator = conf.get(ImportTsv.SEPARATOR_CONF_KEY);
		if (separator == null) {
			separator = ImportTsv.DEFAULT_SEPARATOR;
		} else {
			separator = new String(Base64.decode(separator));
		}

		hbase_rowkey_separator = conf.get(ImportTsv.SEPARATOR_CONF_ROWKEY);
		if (hbase_rowkey_separator == null
				|| hbase_rowkey_separator.trim().length() == 0) {
			hbase_rowkey_separator = "";
		} else {
			hbase_rowkey_separator = new String(
					Base64.decode(hbase_rowkey_separator));
		}

		ts = conf.getLong(ImportTsv.TIMESTAMP_CONF_KEY,
				System.currentTimeMillis());

		skipBadLines = context.getConfiguration().getBoolean(
				ImportTsv.SKIP_LINES_CONF_KEY, true);
		badLineCount = context.getCounter("ImportTsv", "Bad Lines");
	}

	/**
	 * To find a date parser from the datetimeParsers array
	 * 
	 * @return
	 */
	protected Date parseTimestamp(byte[] byteVal, int colIdx)
			throws ParseException {
		Date rtnDate = null;
		String dateString = Bytes.toString(byteVal);
		if (colDatetimeFormater != null && colDatetimeFormater.length > colIdx) {
			int fmtrIdx = colDatetimeFormater[colIdx];
			try {
				rtnDate = datetimeParsers[fmtrIdx].parse(dateString);
			} catch (java.text.ParseException e) {
			}

			if (rtnDate == null) {
				for (int i = 0; i < datetimeParsers.length; i++) {
					try {
						rtnDate = datetimeParsers[i].parse(dateString);
					} catch (java.text.ParseException e) {
					}
					if (rtnDate != null) {
						colDatetimeFormater[colIdx] = (char) i;
						break;
					}
				}
			}
		}
		if (rtnDate == null) {
			LOG.error("No supported data format found: " + dateString);
			throw new ParseException("Failed to parse date: " + dateString, 0);
		}
		return rtnDate;
	}

	/**
	 * Extract byte array for column specified by colIdx.
	 * 
	 * @param lineBytes
	 * @param parsed
	 * @param colIdx
	 * @return
	 */
	protected byte[] getInputColBytes(byte[] lineBytes,
			ImportTsv.TsvParser.ParsedLine parsed, int colIdx) {
		if (colIdx >= columnTypes.length)
			return null;
		int colOffset = parsed.getColumnOffset(colIdx);
		int colLen = parsed.getColumnLength(colIdx);
		byte[] colBytes = new byte[colLen];
		Bytes.putBytes(colBytes, 0, lineBytes, colOffset, colLen);
		return colBytes;
	}

	/**
	 * To create rowkey byte array, the rule is like this: row key can be
	 * composed by several columns change every columns values to String, if
	 * column type is date, change to long first if column values are "kv1  ",
	 * "kv2", "  kv3", ... then the row key string will be "kv1  +kv2+  kv3",
	 * that means the space char will be kept
	 * 
	 * @param lineBytes
	 * @param parsed
	 * @return
	 * @throws BadTsvLineException
	 */
	protected byte[] createRowkeyByteArray(byte[] lineBytes,
			ImportTsv.TsvParser.ParsedLine parsed) throws BadTsvLineException {
		try {

			byte[] colBytes = null;
			Date tmpDate = null;
			StringBuffer sb = new StringBuffer();

			for (int i = 0; i < keyColIndex.length; i++) {
				if (i > 0 && hbase_rowkey_separator.length() > 0)
					sb.append(hbase_rowkey_separator);
				colBytes = getInputColBytes(lineBytes, parsed, keyColIndex[i]);
				if (colBytes == null)
					throw new BadTsvLineException(
							"Failed to get column bytes for " + keyColIndex[i]);
				String rowCol;
				if (columnTypes[keyColIndex[i]] == ImportTsv.COL_TYPE_DATETIME) {
					tmpDate = parseTimestamp(colBytes, keyColIndex[i]);
					rowCol = Long.toString(tmpDate.getTime());
					sb.append(rowCol);
				} else if (columnTypes[keyColIndex[i]] == ImportTsv.COL_TYPE_STRING) {
					// String lineStr = new String(value.getBytes(), 0,
					// value.getLength(), "gb18030");
					// byte[] lineBytes = new Text(lineStr).getBytes();

					if (StringUtils.isEmpty(charset))
						charset = HConstants.UTF8_ENCODING;

					String lineStr = new String(colBytes, charset);
					colBytes = new Text(lineStr).getBytes();

					rowCol = Bytes.toString(colBytes);
					// if original string len < specified string len, then use
					// substring, else using space to right pad.
					if (keyColLen[i] != 0 && rowCol.length() > keyColLen[i])
						sb.append(rowCol.substring(0, keyColLen[i]));
					else
						sb.append(StringUtils.rightPad(rowCol, keyColLen[i]));
				} else if (columnTypes[keyColIndex[i]] == ImportTsv.COL_TYPE_INT) {
					int intVal = Integer.parseInt(Bytes.toString(colBytes));
					rowCol = Integer.toString(intVal);
					sb.append(StringUtils.leftPad(rowCol, keyColLen[i], '0'));
				} else if (columnTypes[keyColIndex[i]] == ImportTsv.COL_TYPE_DOUBLE) {
					double dbval = Double.parseDouble(Bytes.toString(colBytes));
					rowCol = Double.toString(dbval);
					sb.append(rowCol);
				} else if (columnTypes[keyColIndex[i]] == ImportTsv.COL_TYPE_LONG) {
					long longVal = Long.parseLong(Bytes.toString(colBytes));
					rowCol = Long.toString(longVal);
					sb.append(StringUtils.leftPad(rowCol, keyColLen[i], '0'));
				} else {
					rowCol = Bytes.toString(colBytes);
					// if original string len < specified string len, then use
					// substring, else using space to right pad.
					if (keyColLen[i] != 0 && rowCol.length() > keyColLen[i])
						sb.append(rowCol.substring(0, keyColLen[i]));
					else
						sb.append(StringUtils.rightPad(rowCol, keyColLen[i]));
				}
			}
			return sb.toString().getBytes();
		} catch (Exception e) {
			throw new BadTsvLineException(e.getMessage());
		}
	}

	/**
	 * 
	 * @param lineBytes
	 * @param parsed
	 * @param colIdx
	 * @return
	 */
	protected byte[] convertColBytes(byte[] lineBytes,
			ImportTsv.TsvParser.ParsedLine parsed, int colIdx)
			throws BadTsvLineException {
		byte[] rtn = null;
		byte[] srcBytes = getInputColBytes(lineBytes, parsed, colIdx);
		try {
			if (columnTypes[colIdx] == ImportTsv.COL_TYPE_DATETIME) {
				Date tmpDate = parseTimestamp(srcBytes, colIdx);
				;
				rtn = Bytes.toBytes(tmpDate.getTime());
			} else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_INT) {
				int intVal = Integer.parseInt(Bytes.toString(srcBytes));
				rtn = Bytes.toBytes(intVal);
			} else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_DOUBLE) {
				double dbval = Double.parseDouble(Bytes.toString(srcBytes));
				rtn = Bytes.toBytes(dbval);
			} else if (columnTypes[colIdx] == ImportTsv.COL_TYPE_LONG) {
				long longVal = Long.parseLong(Bytes.toString(srcBytes));
				rtn = Bytes.toBytes(longVal);
			} else {
				rtn = srcBytes;
			}
		} catch (Exception e) {
			throw new BadTsvLineException(e.getMessage());
		}
		return rtn;
	}

	/**
	 * Convert a line of TSV text into an HBase table row.
	 */
	@Override
	public void map(LongWritable offset, Text value, Context context)
			throws IOException {

		byte[] lineBytes = value.getBytes();

		// String lineStr = new String(value.getBytes(), 0, value.getLength(),
		// "gb18030");
		// byte[] lineBytes = new Text(lineStr).getBytes();

		int i = 0;
		try {
			ImportTsv.TsvParser.ParsedLine parsed = parser.parse(lineBytes,
					value.getLength());

			// ImportTsv.TsvParser.ParsedLine parsed = parser.parse(
			// lineBytes, Text.utf8Length(lineStr));

			byte[] rowKeyBytes = createRowkeyByteArray(lineBytes, parsed);
			ImmutableBytesWritable rowKey = new ImmutableBytesWritable(
					rowKeyBytes);

			Put put = new Put(rowKeyBytes);
			put.setWriteToWAL(false);

			for (i = 0; i < parsed.getColumnCount(); i++) {

				KeyValue kv = null;
				if (columnTypes[i] == ImportTsv.COL_TYPE_STRING) {
					kv = new KeyValue(rowKeyBytes, parser.getFamily(i),
							parser.getQualifier(i), 0,
							parser.getQualifier(i).length, ts,
							KeyValue.Type.Put, lineBytes,
							parsed.getColumnOffset(i),
							parsed.getColumnLength(i));
				} else {
					byte[] colBytes = convertColBytes(lineBytes, parsed, i);
					if (colBytes == null)
						throw new ImportTsv.TsvParser.BadTsvLineException(
								"Failed to get bytes for column " + i);
					kv = new KeyValue(rowKeyBytes, parser.getFamily(i),
							parser.getQualifier(i), ts, colBytes);
				}
				if (kv == null)
					throw new ImportTsv.TsvParser.BadTsvLineException(
							"Failed to get bytes for column " + i);
				put.add(kv);
			}
			context.write(rowKey, put);
		} catch (ImportTsv.TsvParser.BadTsvLineException badLine) {
			if (skipBadLines) {
				System.err.println("Bad line: "
						+ new String(lineBytes, "gb18030") + ":" + i + "\n");
				LOG.error("Bad line: " + new String(lineBytes, "gb18030") + ","
						+ i);
				incrementBadLineCount(1);
				return;
			} else {
				throw new IOException(badLine);
			}
		} catch (IllegalArgumentException e) {
			if (skipBadLines) {
				System.err.println("Bad line: "
						+ new String(lineBytes, "gb18030") + ":" + i + "\n");
				LOG.error("Bad line: " + new String(lineBytes, "gb18030") + ","
						+ i);
				incrementBadLineCount(1);
				return;
			} else {
				throw new IOException(e);
			}
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
	}
}