/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.cassandrainput;

import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.Deflater;

import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.Compression;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.CqlRow;
import org.apache.cassandra.thrift.KeyRange;
import org.apache.cassandra.thrift.KeySlice;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.TimedOutException;
import org.pentaho.cassandra.CassandraColumnMetaData;
import org.pentaho.cassandra.CassandraConnection;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;

/**
 * Data class for the CassandraInput step. Contains some utility methods for
 * obtaining a connection to cassandra, translating a row from cassandra to
 * Kettle and for compressing a query.
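 * <p>
 * A minimal usage sketch (hypothetical host, port and keyspace values; this
 * also assumes that CassandraConnection exposes setKeyspace() and close(),
 * which are not shown in this class):
 * 
 * <pre>
 * CassandraConnection conn = CassandraInputData.getCassandraConnection(
 *     "localhost", 9160, null, null);
 * try {
 *   conn.setKeyspace("MyKeySpace"); // hypothetical keyspace
 *   // ... issue CQL or slice queries via conn.getClient() ...
 * } finally {
 *   conn.close();
 * }
 * </pre>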
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class CassandraInputData extends BaseStepData implements
    StepDataInterface {

  /** The output data format */
  protected RowMetaInterface m_outputRowMeta;

  /**
   * Get the output row format
   * 
   * @return the output row format
   */
  public RowMetaInterface getOutputRowMeta() {
    return m_outputRowMeta;
  }

  /**
   * Set the output row format
   * 
   * @param rmi the output row format
   */
  public void setOutputRowMeta(RowMetaInterface rmi) {
    m_outputRowMeta = rmi;
  }

  /**
   * Get a connection to cassandra
   * 
   * @param host the hostname of a cassandra node
   * @param port the port that cassandra is listening on
   * @param username the username for (optional) authentication
   * @param password the password for (optional) authentication
   * @return a connection to cassandra
   * @throws Exception if a problem occurs during connection
   */
  public static CassandraConnection getCassandraConnection(String host,
      int port, String username, String password) throws Exception {
    return new CassandraConnection(host, port, username, password, -1);
  }

  /**
   * Get a connection to cassandra
   * 
   * @param host the hostname of a cassandra node
   * @param port the port that cassandra is listening on
   * @param username the username for (optional) authentication
   * @param password the password for (optional) authentication
   * @param timeout the socket timeout to use
   * @return a connection to cassandra
   * @throws Exception if a problem occurs during connection
   */
  public static CassandraConnection getCassandraConnection(String host,
      int port, String username, String password, int timeout)
      throws Exception {
    return new CassandraConnection(host, port, username, password, timeout);
  }

  // ------------------------------------------------------------------
  // The following code implements pure Thrift-based
  // <key, col_name, value, timestamp> tuple extraction

  protected boolean m_newSliceQuery = false;
  protected List<String> m_requestedCols = null;
  protected int m_sliceRowsMax;
  protected int m_sliceColsMax;
  protected int m_sliceRowsBatchSize;
  protected int m_sliceColsBatchSize;
  protected SliceRange m_sliceRange;
  protected KeyRange m_keyRange;
  protected SlicePredicate m_slicePredicate;
  protected ColumnParent m_colParent;

  int m_rowIndex;
  int m_colIndex;

  // current batch of rows
  protected List<KeySlice> m_cassandraRows;

  // current batch of columns from current row
  protected List<ColumnOrSuperColumn> m_currentCols;

  protected List<Object[]> m_converted;
  protected int m_colCount;
  protected int m_rowCount;

  /**
   * Initializes a Thrift-based slice query by setting up the slice predicate,
   * key range and batching state.
   * 
   * @param meta meta data on the cassandra column family being read from
   * @param colNames specific column names to fetch (null or empty means all
   *          columns)
   * @param maxRows the maximum number of rows to process
   * @param maxCols the maximum number of columns per row to process
   * @param rowBatchSize the number of rows to fetch per batch
   * @param colBatchSize the number of columns to fetch per batch
   * @throws KettleException if a problem occurs
   */
  public void sliceModeInit(CassandraColumnMetaData meta,
      List<String> colNames, int maxRows, int maxCols, int rowBatchSize,
      int colBatchSize) throws KettleException {
    m_newSliceQuery = true;
    m_requestedCols = colNames;
    m_sliceRowsMax = maxRows;
    m_sliceColsMax = maxCols;
    m_sliceRowsBatchSize = rowBatchSize;
    m_sliceColsBatchSize = colBatchSize;
    m_rowIndex = 0;
    m_colIndex = 0;

    if (m_sliceColsBatchSize <= 0) {
      m_sliceColsBatchSize = Integer.MAX_VALUE;
    }
    if (m_sliceRowsBatchSize <= 0) {
      m_sliceRowsBatchSize = Integer.MAX_VALUE;
    }

    List<ByteBuffer> specificCols = null;
    if (m_requestedCols != null && m_requestedCols.size() > 0) {
      specificCols = new ArrayList<ByteBuffer>();

      // encode the textual column names
      for (String colName : m_requestedCols) {
        ByteBuffer encoded = meta.columnNameToByteBuffer(colName);
        specificCols.add(encoded);
      }
    }

    m_slicePredicate = new SlicePredicate();
    if (specificCols == null) {
      m_sliceRange = new SliceRange(ByteBuffer.wrap(new byte[0]),
          ByteBuffer.wrap(new byte[0]), false,
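          // (an empty start and an empty finish buffer mean an unbounded
          // column range; "false" = not reversed; the final argument caps
          // the number of columns returned per slice request)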
          m_sliceColsBatchSize);
      m_slicePredicate.setSlice_range(m_sliceRange);
    } else {
      m_slicePredicate.setColumn_names(specificCols);
    }

    m_keyRange = new KeyRange(m_sliceRowsBatchSize);
    m_keyRange.setStart_key(new byte[0]);
    m_keyRange.setEnd_key(new byte[0]);

    m_colParent = new ColumnParent(meta.getColumnFamilyName());
    m_converted = new ArrayList<Object[]>();
  }

  /**
   * Advances m_rowIndex within the current batch until a row that actually
   * has columns is found; sets m_currentCols to null if no such row exists.
   */
  private void advanceToNonEmptyRow() {
    KeySlice row = m_cassandraRows.get(m_rowIndex);
    m_currentCols = row.getColumns();

    int skipSize = 0;
    while (m_currentCols.size() == skipSize
        && m_rowIndex < m_cassandraRows.size() - 1) {
      m_rowIndex++;
      row = m_cassandraRows.get(m_rowIndex);
      m_currentCols = row.getColumns();
    }

    if (m_currentCols.size() == skipSize) {
      // we've been through the batch and there are no columns in any of
      // these rows - so nothing to output! Indicate this by setting
      // currentCols to null
      m_currentCols = null;
    }
  }

  /**
   * Fetches the next batch of rows from Cassandra, continuing from the last
   * key of the previous batch.
   */
  private void getNextBatchOfRows(CassandraConnection conn) throws Exception {
    // reset the column range (if necessary)
    if (m_requestedCols == null) {
      m_sliceRange = m_sliceRange.setStart(ByteBuffer.wrap(new byte[0]));
      m_sliceRange = m_sliceRange.setFinish(ByteBuffer.wrap(new byte[0]));
      m_slicePredicate.setSlice_range(m_sliceRange);
    }

    // set the key range start to the last key from the last batch of rows
    m_keyRange.setStart_key(m_cassandraRows.get(m_cassandraRows.size() - 1)
        .getKey());
    m_cassandraRows = conn.getClient().get_range_slices(m_colParent,
        m_slicePredicate, m_keyRange, ConsistencyLevel.ONE);
    m_colCount = 0;

    // key ranges are *inclusive* of the start key - we will have already
    // processed the first row in the last batch. Hence start at index 1 of
    // this batch
    m_rowIndex = 1;
    if (m_cassandraRows == null || m_cassandraRows.size() <= 1
        || m_rowCount == m_sliceRowsMax) {
      // indicate done
      m_currentCols = null;
      m_cassandraRows = null;
    } else {
      advanceToNonEmptyRow();
    }
  }

  /**
   * Fetches the next batch of columns for the current row, advancing to the
   * next row (or batch of rows) when the current row is exhausted.
   */
  private void getNextBatchOfColumns(CassandraConnection conn)
      throws Exception {
    m_sliceRange = m_sliceRange.setStart(m_currentCols
        .get(m_currentCols.size() - 1).getColumn().bufferForName());
    m_slicePredicate.setSlice_range(m_sliceRange);

    // fetch the next bunch of columns for the current row
    m_currentCols = conn.getClient().get_slice(
        m_cassandraRows.get(m_rowIndex).bufferForKey(), m_colParent,
        m_slicePredicate, ConsistencyLevel.ONE);

    // as far as I understand it - these things are always inclusive of the
    // start element, so we need to skip the first element cause it was
    // processed already in the last batch of columns
    if (m_currentCols == null || m_currentCols.size() <= 1) {
      // no more columns in the current row - move to the next row
      m_rowCount++;
      m_rowIndex++;
      m_colCount = 0;
      if (m_rowIndex == m_cassandraRows.size()) {
        getNextBatchOfRows(conn);
        while (m_cassandraRows != null && m_currentCols == null) {
          // keep going until we get some rows with columns!
          getNextBatchOfRows(conn);
        }
      } else {
        advanceToNonEmptyRow();
        while (m_cassandraRows != null && m_currentCols == null) {
          // keep going until we get some rows with columns!
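          // (getNextBatchOfRows() sets m_cassandraRows to null once the key
          // range is exhausted, so this loop always terminates)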
          getNextBatchOfRows(conn);
        }
      }
    } else {
      // we need to discard the first col in the list since we will have
      // processed that already in the last batch
      m_currentCols.remove(0);
    }
  }

  /**
   * Converts the current slice-mode batch into zero or more Kettle rows in
   * the key, colName, colValue, timestamp format, fetching further batches
   * of rows/columns from Cassandra as needed.
   * 
   * @param metaData meta data on the cassandra column family being read from
   * @param conn the connection to use
   * @return a list of Kettle rows, or null when there are no more rows
   * @throws KettleException if a problem occurs
   */
  public List<Object[]> cassandraRowToKettleTupleSliceMode(
      CassandraColumnMetaData metaData, CassandraConnection conn)
      throws KettleException {
    m_converted.clear();

    int timeouts = 0;
    try {
      while (timeouts < 5) {
        try {
          if (m_newSliceQuery) {
            m_cassandraRows = conn.getClient().get_range_slices(m_colParent,
                m_slicePredicate, m_keyRange, ConsistencyLevel.ONE);

            if (m_cassandraRows == null || m_cassandraRows.size() == 0) {
              // done
              return null;
            } else {
              advanceToNonEmptyRow();
              while (m_cassandraRows != null && m_currentCols == null) {
                // keep going until we get some rows with columns!
                getNextBatchOfRows(conn);
              }

              if (m_cassandraRows == null) {
                // we're done
                return null;
              }
              m_colCount = 0;
              m_rowCount = 0;
              m_newSliceQuery = false;
            }
          } else {
            // determine what we need to get next - more columns from the
            // current row, or start next row, or get next row batch, or done
            if (m_rowCount == m_sliceRowsMax) {
              // hit our LIMIT of rows - done
              return null;
            }

            if (m_rowIndex == m_cassandraRows.size()) {
              // get next batch of rows
              getNextBatchOfRows(conn);
              while (m_cassandraRows != null && m_currentCols == null) {
                // keep going until we get some rows with columns!
                getNextBatchOfRows(conn);
              }

              if (m_cassandraRows == null) {
                // we're done
                return null;
              }
            } else if (m_colCount == -1) {
              // get next row
              KeySlice row = m_cassandraRows.get(m_rowIndex);
              m_currentCols = row.getColumns();
              m_colCount = 0;
            } else {
              getNextBatchOfColumns(conn);

              // check against our limit again
              if (m_rowCount == m_sliceRowsMax) {
                return null;
              }

              if (m_cassandraRows == null) {
                // we're done
                return null;
              }
            }
          }
          break;
        } catch (TimedOutException e) {
          timeouts++;
        }
      }

      if (timeouts == 5) {
        throw new KettleException(
            "Maximum number of consecutive timeouts exceeded");
      }

      KeySlice row = m_cassandraRows.get(m_rowIndex);
      Object rowKey = metaData.getKeyValue(row);
      if (rowKey == null) {
        throw new KettleException("Unable to obtain a key value for the row!");
      }

      String keyName = metaData.getKeyName();
      int keyIndex = m_outputRowMeta.indexOfValue(keyName);
      if (keyIndex < 0) {
        throw new KettleException("Unable to find the key field name '"
            + keyName + "' in the output row meta data!");
      }

      for (int i = 0; i < m_currentCols.size(); i++) {
        Object[] outputRowData = RowDataUtil.allocateRowData(m_outputRowMeta
            .size());
        outputRowData[keyIndex] = rowKey;

        Column col = m_currentCols.get(i).getColumn();
        String colName = metaData.getColumnName(col);
        Object colValue = metaData.getColumnValue(col);
        if (colValue == null) {
          // skip null columns (only applies if we're processing a specified
          // list of columns rather than all columns).
          continue;
        }

        outputRowData[1] = colName;

        String stringV = colValue.toString();
        outputRowData[2] = stringV;

        if (colValue instanceof Date) {
          ValueMeta tempDateMeta = new ValueMeta("temp",
              ValueMetaInterface.TYPE_DATE);
          stringV = tempDateMeta.getString(colValue);
          outputRowData[2] = stringV;
        } else if (colValue instanceof byte[]) {
          outputRowData[2] = colValue;
        }

        // the timestamp as a long
        long timestampL = col.getTimestamp();
        outputRowData[3] = timestampL;

        m_converted.add(outputRowData);
        m_colCount++;
        if (m_colCount == m_sliceColsMax && m_requestedCols == null) {
          // max number of cols reached for this row
          m_colCount = -1; // indicate move to the next row
          m_rowCount++;
          m_rowIndex++;
          break; // don't process any more
        }
      }

      if (m_requestedCols != null) {
        // assume that we don't need to page columns when the user has
        // explicitly named the ones that they want
        m_colCount = -1;
        m_rowCount++;
        m_rowIndex++;
      }
    } catch (Exception ex) {
      throw new KettleException(ex.getMessage(), ex);
    }

    return m_converted;
  }

  // --------------- End Thrift-based tuple mode -------------------------

  /**
   * Converts a cassandra row to a Kettle row in the key, colName, colValue,
   * timestamp format
   * 
   * @param metaData meta data on the cassandra column family being read from
   * @param cassandraRow a row from the column family
   * @param cassandraColIter an iterator over columns for the current row
   * @return a Kettle row
   * @throws KettleException if a problem occurs
   */
  public Object[] cassandraRowToKettleTupleMode(
      CassandraColumnMetaData metaData, CqlRow cassandraRow,
      Iterator<Column> cassandraColIter) throws KettleException {
    Object[] outputRowData = RowDataUtil
        .allocateRowData(m_outputRowMeta.size());
    Object key = metaData.getKeyValue(cassandraRow);
    if (key == null) {
      throw new KettleException("Unable to obtain a key value for the row!");
    }

    String keyName = metaData.getKeyName();
    int keyIndex = m_outputRowMeta.indexOfValue(keyName);
    if (keyIndex < 0) {
      throw new KettleException("Unable to find the key field name '"
          + keyName + "' in the output row meta data!");
    }
    outputRowData[keyIndex] = key;

    // advance the iterator to the next column
    if (cassandraColIter.hasNext()) {
      Column aCol = cassandraColIter.next();

      String colName = metaData.getColumnName(aCol);

      // skip the key
      if (colName.equals("KEY")) {
        if (cassandraColIter.hasNext()) {
          aCol = cassandraColIter.next();
          colName = metaData.getColumnName(aCol);
        } else {
          // run out of columns
          return null;
        }
      }

      // for queries that specify column names we need to check that the
      // value is not null in this row
      while (metaData.getColumnValue(aCol) == null) {
        if (cassandraColIter.hasNext()) {
          aCol = cassandraColIter.next();
          colName = metaData.getColumnName(aCol);
        } else {
          return null;
        }
      }

      outputRowData[1] = colName;

      // do the value (stored as a string)
      Object colValue = metaData.getColumnValue(aCol);

      String stringV = colValue.toString();
      outputRowData[2] = stringV;

      if (colValue instanceof Date) {
        ValueMeta tempDateMeta = new ValueMeta("temp",
            ValueMetaInterface.TYPE_DATE);
        stringV = tempDateMeta.getString(colValue);
        outputRowData[2] = stringV;
      } else if (colValue instanceof byte[]) {
        outputRowData[2] = colValue;
      }

      // the timestamp as a long
      long timestampL = aCol.getTimestamp();
      outputRowData[3] = timestampL;
    } else {
      return null; // signify no more columns for this row...
    }

    return outputRowData;
  }

  /**
   * Converts a cassandra row to a Kettle row
   * 
   * @param metaData meta data on the cassandra column family being read from
   * @param cassandraRow a row from the column family
   * @param outputFormatMap a Map of output field names to indexes in the
   *          outgoing Kettle row structure
   * @return a Kettle row
   * @throws KettleException if a problem occurs
   */
  public Object[] cassandraRowToKettle(CassandraColumnMetaData metaData,
      CqlRow cassandraRow, Map<String, Integer> outputFormatMap)
      throws KettleException {
    Object[] outputRowData = RowDataUtil
        .allocateRowData(m_outputRowMeta.size());
    Object key = metaData.getKeyValue(cassandraRow);
    if (key == null) {
      throw new KettleException("Unable to obtain a key value for the row!");
    }

    String keyName = metaData.getKeyName();
    int keyIndex = m_outputRowMeta.indexOfValue(keyName);
    if (keyIndex < 0) {
      throw new KettleException("Unable to find the key field name '"
          + keyName + "' in the output row meta data!");
    }
    outputRowData[keyIndex] = key;

    // do the columns
    List<Column> rowColumns = cassandraRow.getColumns();
    for (Column aCol : rowColumns) {
      String colName = metaData.getColumnName(aCol);
      Integer outputIndex = outputFormatMap.get(colName);
      if (outputIndex != null) {
        Object colValue = metaData.getColumnValue(aCol);
        outputRowData[outputIndex.intValue()] = colValue;
      }
    }

    return outputRowData;
  }

  /**
   * Extract the column family name (table name) from a CQL SELECT query.
   * Assumes that any Kettle variables have already been substituted in the
   * query.
   * 
   * @param subQ the query with vars substituted
   * @return the column family name or null if the query is malformed
   */
  public static String getColumnFamilyNameFromCQLSelectQuery(String subQ) {
    String result = null;

    if (Const.isEmpty(subQ)) {
      return null;
    }

    // assumes env variables already replaced in query!
    if (!subQ.toLowerCase().startsWith("select")) {
      // not a select statement!
      return null;
    }

    if (subQ.indexOf(';') < 0) {
      // query must end with a ';' or it will wait for more!
      return null;
    }

    // subQ = subQ.toLowerCase();

    // strip off where clause (if any)
    if (subQ.toLowerCase().lastIndexOf("where") > 0) {
      subQ = subQ.substring(0, subQ.toLowerCase().lastIndexOf("where"));
    }

    // determine the source column family
    // look for a FROM that is surrounded by space
    int fromIndex = subQ.toLowerCase().indexOf("from");
    String tempS = subQ.toLowerCase();
    int offset = fromIndex;
    while (fromIndex > 0 && tempS.charAt(fromIndex - 1) != ' '
        && (fromIndex + 4 < tempS.length())
        && tempS.charAt(fromIndex + 4) != ' ') {
      tempS = tempS.substring(fromIndex + 4, tempS.length());
      fromIndex = tempS.indexOf("from");
      offset += (4 + fromIndex);
    }

    fromIndex = offset;
    if (fromIndex < 0) {
      return null; // no from clause
    }

    result = subQ.substring(fromIndex + 4, subQ.length()).trim();

    if (result.indexOf(' ') > 0) {
      result = result.substring(0, result.indexOf(' '));
    } else {
      result = result.replace(";", "");
    }

    if (result.length() == 0) {
      return null; // no column family specified
    }

    return result;
  }

  /**
   * Compress a CQL query
   * 
   * @param queryStr the CQL query
   * @param compression compression option (GZIP is the only option - so far)
   * @return an array of bytes containing the compressed query
   */
  public static byte[] compressQuery(String queryStr, Compression compression) {
    byte[] data = queryStr.getBytes(Charset
        .forName(CassandraColumnMetaData.UTF8));

    Deflater compressor = new Deflater();
    compressor.setInput(data);
    compressor.finish();

    ByteArrayOutputStream byteArray = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    while (!compressor.finished()) {
      int size = compressor.deflate(buffer);
      byteArray.write(buffer, 0, size);
    }

    return byteArray.toByteArray();
  }
}
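
/*
 * A sketch (hypothetical query text and connection) of how the two static
 * helpers above are typically combined when submitting a compressed CQL
 * query over the Thrift client:
 *
 *   String q = "SELECT * FROM MyColumnFamily;";
 *   String colFam =
 *       CassandraInputData.getColumnFamilyNameFromCQLSelectQuery(q);
 *   // colFam is "MyColumnFamily"
 *   byte[] compressed = CassandraInputData.compressQuery(q, Compression.GZIP);
 *   conn.getClient().execute_cql_query(java.nio.ByteBuffer.wrap(compressed),
 *       Compression.GZIP); // conn obtained via getCassandraConnection(...)
 */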