/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.cassandrainput;

import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.Compression;
import org.apache.cassandra.thrift.CqlResult;
import org.apache.cassandra.thrift.CqlRow;
import org.pentaho.cassandra.CassandraColumnMetaData;
import org.pentaho.cassandra.CassandraConnection;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Class providing an input step for reading data from a table (column family)
 * in Cassandra. Accesses the schema information stored in Cassandra for type
 * information.
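 * <p>
 * The step executes the user-supplied CQL SELECT statement against the
 * configured keyspace. A hypothetical query (the column family and column
 * names here are illustrative only) might look like:
 *
 * <pre>
 * SELECT col1, col2 FROM MyColumnFamily LIMIT 100;
 * </pre>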
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class CassandraInput extends BaseStep implements StepInterface {

  protected CassandraInputMeta m_meta;
  protected CassandraInputData m_data;

  public CassandraInput(StepMeta stepMeta, StepDataInterface stepDataInterface,
      int copyNr, TransMeta transMeta, Trans trans) {
    super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
  }

  /** Connection to Cassandra */
  protected CassandraConnection m_connection;

  /** Column meta data and schema information */
  protected CassandraColumnMetaData m_cassandraMeta;

  /** For iterating over a result set */
  protected Iterator<CqlRow> m_resultIterator;

  /**
   * Map of indexes into the output field structure (the key is special - it is
   * always the first field in the output row meta)
   */
  protected Map<String, Integer> m_outputFormatMap = new HashMap<String, Integer>();

  @Override
  public boolean processRow(StepMetaInterface smi, StepDataInterface sdi)
      throws KettleException {

    if (first) {
      first = false;
      m_data = (CassandraInputData) sdi;
      m_meta = (CassandraInputMeta) smi;

      // Get the connection to Cassandra
      String hostS = environmentSubstitute(m_meta.getCassandraHost());
      String portS = environmentSubstitute(m_meta.getCassandraPort());
      String timeoutS = environmentSubstitute(m_meta.getSocketTimeout());
      String userS = m_meta.getUsername();
      String passS = m_meta.getPassword();
      if (!Const.isEmpty(userS) && !Const.isEmpty(passS)) {
        userS = environmentSubstitute(userS);
        passS = environmentSubstitute(passS);
      }
      String keyspaceS = environmentSubstitute(m_meta.getCassandraKeyspace());

      if (Const.isEmpty(hostS) || Const.isEmpty(portS)
          || Const.isEmpty(keyspaceS)) {
        throw new KettleException("Some connection details are missing!");
      }

      logBasic(BaseMessages.getString(CassandraInputMeta.PKG,
          "CassandraInput.Info.Connecting", hostS, portS, keyspaceS));

      try {
        if (Const.isEmpty(timeoutS)) {
          m_connection = CassandraInputData.getCassandraConnection(hostS,
              Integer.parseInt(portS), userS, passS);
        } else {
          m_connection = CassandraInputData.getCassandraConnection(hostS,
              Integer.parseInt(portS), userS, passS,
              Integer.parseInt(timeoutS));
        }
        m_connection.setKeyspace(keyspaceS);
      } catch (Exception ex) {
        closeConnection();
        throw new KettleException(ex.getMessage(), ex);
      }

      // check the source column family (table) first
      String colFamName = m_data
          .getColumnFamilyNameFromCQLSelectQuery(environmentSubstitute(m_meta
              .getCQLSelectQuery()));

      if (Const.isEmpty(colFamName)) {
        throw new KettleException(BaseMessages.getString(
            CassandraInputMeta.PKG,
            "CassandraInput.Error.NonExistentColumnFamily"));
      }

      try {
        if (!CassandraColumnMetaData.columnFamilyExists(m_connection,
            colFamName)) {
          throw new KettleException(BaseMessages.getString(
              CassandraInputMeta.PKG,
              "CassandraInput.Error.NonExistentColumnFamily", colFamName,
              keyspaceS));
        }
      } catch (Exception ex) {
        closeConnection();
        throw new KettleException(ex.getMessage(), ex);
      }

      // set up the output row meta
      m_data.setOutputRowMeta(new RowMeta());
      m_meta.getFields(m_data.getOutputRowMeta(), getStepname(), null, null,
          this);
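      // A sketch of the expected outgoing layouts (inferred from the two
      // processing paths below): in <key, value, timestamp> tuple mode the
      // row meta is the fixed tuple structure; otherwise it holds one field
      // per column named in the CQL SELECT, with the row key first.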
      // check that there are some outgoing fields!
      if (m_data.getOutputRowMeta().size() == 0) {
        throw new KettleException(BaseMessages.getString(
            CassandraInputMeta.PKG,
            "CassandraInput.Error.QueryWontProduceOutputFields"));
      }

      // set up the lookup map
      if (!m_meta.getOutputKeyValueTimestampTuples()) {
        for (int i = 0; i < m_data.getOutputRowMeta().size(); i++) {
          String fieldName = m_data.getOutputRowMeta().getValueMeta(i)
              .getName();
          m_outputFormatMap.put(fieldName, i);
        }
      }

      // column family name (key) is the first field output
      try {
        logBasic(BaseMessages.getString(CassandraInputMeta.PKG,
            "CassandraInput.Info.GettintMetaData", colFamName));

        m_cassandraMeta = new CassandraColumnMetaData(m_connection, colFamName);
      } catch (Exception e) {
        closeConnection();
        throw new KettleException(e.getMessage(), e);
      }

      String queryS = environmentSubstitute(m_meta.getCQLSelectQuery());
      Compression compression = m_meta.getUseCompression() ? Compression.GZIP
          : Compression.NONE;
      try {
        if (!m_meta.getUseThriftIO()) {
          logBasic(BaseMessages.getString(CassandraInputMeta.PKG,
              "CassandraInput.Info.ExecutingQuery", queryS,
              (m_meta.getUseCompression() ? BaseMessages.getString(
                  CassandraInputMeta.PKG,
                  "CassandraInput.Info.UsingGZIPCompression") : "")));

          byte[] queryBytes = (m_meta.getUseCompression() ? CassandraInputData
              .compressQuery(queryS, compression) : queryS.getBytes());

          // In Cassandra 1.1 the version of CQL to use can be set
          // programmatically. The default is to use CQL v 2.0.0
          // m_connection.getClient().set_cql_version("3.0.0");
          CqlResult result = m_connection.getClient().execute_cql_query(
              ByteBuffer.wrap(queryBytes), compression);
          m_resultIterator = result.getRowsIterator();
        } else if (m_meta.getOutputKeyValueTimestampTuples()) {
          // --------------- use thrift IO (only applicable for <key, value>
          // tuple mode at present) ----------
          List<String> userCols = (m_meta.m_specificCols != null && m_meta.m_specificCols
              .size() > 0) ? m_meta.m_specificCols : null;

          m_data.sliceModeInit(m_cassandraMeta, userCols, m_meta.m_rowLimit,
              m_meta.m_colLimit, m_meta.m_rowBatchSize, m_meta.m_colBatchSize);

          List<Object[]> batch = m_data.cassandraRowToKettleTupleSliceMode(
              m_cassandraMeta, m_connection);
          while (batch != null) {
            for (Object[] r : batch) {
              putRow(m_data.getOutputRowMeta(), r);

              if (log.isRowLevel()) {
                log.logRowlevel(toString(), "Outputted row #" + getProcessed()
                    + " : " + r);
              }
            }
            batch = m_data.cassandraRowToKettleTupleSliceMode(m_cassandraMeta,
                m_connection);
          }

          // done
          closeConnection();
          setOutputDone();

          return false;
          // --------------- end thrift IO mode
        }
      } catch (Exception e) {
        closeConnection();
        throw new KettleException(e.getMessage(), e);
      }
    }
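    // After first-row initialization, each subsequent call to processRow()
    // consumes one CqlRow from the result iterator and emits one or more
    // Kettle rows for it (tuple mode can produce several rows per CqlRow).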
    if (m_resultIterator.hasNext()) {
      CqlRow nextRow = m_resultIterator.next();
      Object[] outputRowData = null;

      if (m_meta.getOutputKeyValueTimestampTuples()) {
        Iterator<Column> columnIterator = nextRow.getColumnsIterator();

        // The key always appears to be the first column in the list (even
        // though it is separately available via CqlRow.getKey()). We discard
        // it here because testing for a column named "KEY" only works if
        // column names are textual.
        //
        // NOTE: this assumption is only true for wildcard queries (i.e.
        // select *)! So select col1, col2 etc. or ranges (which we don't
        // support) will not include the row key as the first column.
        if (m_meta.m_isSelectStarQuery) {
          columnIterator.next(); // throw away the key column
        }

        while ((outputRowData = m_data.cassandraRowToKettleTupleMode(
            m_cassandraMeta, nextRow, columnIterator)) != null) {
          putRow(m_data.getOutputRowMeta(), outputRowData);

          if (log.isRowLevel()) {
            log.logRowlevel(toString(), "Outputted row #" + getProcessed()
                + " : " + outputRowData);
          }
        }
      } else {
        outputRowData = m_data.cassandraRowToKettle(m_cassandraMeta, nextRow,
            m_outputFormatMap);

        // output the row
        putRow(m_data.getOutputRowMeta(), outputRowData);

        if (log.isRowLevel()) {
          log.logRowlevel(toString(), "Outputted row #" + getProcessed()
              + " : " + outputRowData);
        }
      }
    } else {
      closeConnection();
      setOutputDone();

      return false;
    }

    if (checkFeedback(getProcessed())) {
      logBasic("Read " + getProcessed() + " rows from Cassandra");
    }

    return true;
  }

  @Override
  public void setStopped(boolean stopped) {
    if (isStopped() && stopped) {
      return;
    }

    super.setStopped(stopped);

    if (stopped) {
      closeConnection();
    }
  }

  protected void closeConnection() {
    if (m_connection != null) {
      logBasic(BaseMessages.getString(CassandraInputMeta.PKG,
          "CassandraInput.Info.ClosingConnection"));
      m_connection.close();
    }
  }
}