/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.cassandrainput;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.eclipse.swt.widgets.Shell;
import org.pentaho.cassandra.CassandraColumnMetaData;
import org.pentaho.cassandra.CassandraConnection;
import org.pentaho.di.core.CheckResultInterface;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.Counter;
import org.pentaho.di.core.annotations.Step;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.encryption.Encr;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.exception.KettleXMLException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.xml.XMLHandler;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.repository.ObjectId;
import org.pentaho.di.repository.Repository;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStepMeta;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepDialogInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.w3c.dom.Node;

/**
 * Class providing an input step for reading data from a Cassandra column
 * family (table).
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
@Step(id = "CassandraInput", image = "Cassandra.png", name = "Cassandra Input", description = "Reads data from a Cassandra table", categoryDescription = "Big Data")
public class CassandraInputMeta extends BaseStepMeta implements
    StepMetaInterface {

  protected static final Class<?> PKG = CassandraInputMeta.class;

  /** The host to contact */
  protected String m_cassandraHost = "localhost";

  /** The port that cassandra is listening on */
  protected String m_cassandraPort = "9160";

  /** Username for authentication */
  protected String m_username;

  /** Password for authentication */
  protected String m_password;

  /** The keyspace (database) to use */
  protected String m_cassandraKeyspace;

  /** Whether to use GZIP compression of CQL queries */
  protected boolean m_useCompression;

  /** The select query to execute */
  protected String m_cqlSelectQuery = "SELECT <fields> FROM <column family> WHERE <condition>;";

  /** Output in tuple mode? */
  protected boolean m_outputKeyValueTimestampTuples;
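  // In tuple mode each output row is a <key, ColumnName, ColumnValue,
  // Timestamp> tuple rather than one Kettle row per Cassandra row - see
  // getFields() below for the exact output layout.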
  /** Use thrift IO for tuple mode? */
  protected boolean m_useThriftIO = false;

  /**
   * Timeout (milliseconds) to use for socket connections - blank means use
   * cluster default
   */
  protected String m_socketTimeout = "";

  // set based on parsed CQL
  /**
   * True if a select * is being done - this is important to know because rows
   * from select * queries contain the key as the first column. Key is also
   * available separately in the API (and we use this for retrieving the key).
   * The column that contains the key in this case is not necessarily
   * convertible using the default column validator because there is a separate
   * key validator. So we need to be able to recognize the key when it appears
   * as a column and skip it. Can't rely on its name (KEY) since this is only
   * easily detectable when the column names are strings.
   */
  protected boolean m_isSelectStarQuery = false;

  // these are set based on the parsed CQL when executing tuple mode using
  // thrift
  protected int m_rowLimit = -1; // no limit - otherwise we look for LIMIT in
                                 // CQL
  protected int m_colLimit = -1; // no limit - otherwise we look for FIRST N in
                                 // CQL

  // maximum number of rows or columns to pull over at one time via thrift
  protected int m_rowBatchSize = 100;
  protected int m_colBatchSize = 100;

  protected List<String> m_specificCols;

  /**
   * Set the timeout (milliseconds) to use for socket comms
   *
   * @param t the timeout to use in milliseconds
   */
  public void setSocketTimeout(String t) {
    m_socketTimeout = t;
  }

  /**
   * Get the timeout (milliseconds) to use for socket comms
   *
   * @return the timeout to use in milliseconds
   */
  public String getSocketTimeout() {
    return m_socketTimeout;
  }

  /**
   * Set whether to use pure thrift IO for the <key,value> tuple mode.
   *
   * @param useThrift true if thrift IO is to be used
   */
  public void setUseThriftIO(boolean useThrift) {
    m_useThriftIO = useThrift;
  }

  /**
   * Get whether to use pure thrift IO for the <key,value> tuple mode.
   *
   * @return true if thrift IO is to be used
   */
  public boolean getUseThriftIO() {
    return m_useThriftIO;
  }

  /**
   * Set the cassandra node hostname to connect to
   *
   * @param host the host to connect to
   */
  public void setCassandraHost(String host) {
    m_cassandraHost = host;
  }

  /**
   * Get the name of the cassandra node to connect to
   *
   * @return the name of the cassandra node to connect to
   */
  public String getCassandraHost() {
    return m_cassandraHost;
  }

  /**
   * Set the port that cassandra is listening on
   *
   * @param port the port that cassandra is listening on
   */
  public void setCassandraPort(String port) {
    m_cassandraPort = port;
  }

  /**
   * Get the port that cassandra is listening on
   *
   * @return the port that cassandra is listening on
   */
  public String getCassandraPort() {
    return m_cassandraPort;
  }

  /**
   * Set the keyspace (db) to use
   *
   * @param keyspace the keyspace to use
   */
  public void setCassandraKeyspace(String keyspace) {
    m_cassandraKeyspace = keyspace;
  }

  /**
   * Get the keyspace (db) to use
   *
   * @return the keyspace (db) to use
   */
  public String getCassandraKeyspace() {
    return m_cassandraKeyspace;
  }

  /**
   * Set whether to compress (GZIP) CQL queries when transmitting them to the
   * server
   *
   * @param c true if CQL queries are to be compressed
   */
  public void setUseCompression(boolean c) {
    m_useCompression = c;
  }

  /**
   * Get whether CQL queries will be compressed (GZIP) or not
   *
   * @return true if CQL queries will be compressed when sending to the server
   */
  public boolean getUseCompression() {
    return m_useCompression;
  }
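  // Illustrative only (hypothetical keyspace/column names): the step expects a
  // single CQL SELECT statement terminated by ';', e.g.
  //
  //   SELECT FIRST 50 'col1', 'col2' FROM users WHERE key = 'k1' LIMIT 1000;
  //
  // Kettle variables may be embedded in the query; they are resolved via
  // environmentSubstitute() before the statement is parsed in getFields().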
  /**
   * Set the CQL SELECT query to execute.
   *
   * @param query the query to execute
   */
  public void setCQLSelectQuery(String query) {
    m_cqlSelectQuery = query;
  }

  /**
   * Get the CQL SELECT query to execute
   *
   * @return the query to execute
   */
  public String getCQLSelectQuery() {
    return m_cqlSelectQuery;
  }

  /**
   * Set the username to authenticate with
   *
   * @param un the username to authenticate with
   */
  public void setUsername(String un) {
    m_username = un;
  }

  /**
   * Get the username to authenticate with
   *
   * @return the username to authenticate with
   */
  public String getUsername() {
    return m_username;
  }

  /**
   * Set the password to authenticate with
   *
   * @param pass the password to authenticate with
   */
  public void setPassword(String pass) {
    m_password = pass;
  }

  /**
   * Get the password to authenticate with
   *
   * @return the password to authenticate with
   */
  public String getPassword() {
    return m_password;
  }

  /**
   * Set whether to output key, column, timestamp tuples as rows rather than
   * standard row format.
   *
   * @param o true if tuples are to be output
   */
  public void setOutputKeyValueTimestampTuples(boolean o) {
    m_outputKeyValueTimestampTuples = o;
  }

  /**
   * Get whether to output key, column, timestamp tuples as rows rather than
   * standard row format.
   *
   * @return true if tuples are to be output
   */
  public boolean getOutputKeyValueTimestampTuples() {
    return m_outputKeyValueTimestampTuples;
  }

  @Override
  public String getXML() {
    StringBuffer retval = new StringBuffer();

    if (!Const.isEmpty(m_cassandraHost)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("cassandra_host", m_cassandraHost));
    }

    if (!Const.isEmpty(m_cassandraPort)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("cassandra_port", m_cassandraPort));
    }

    if (!Const.isEmpty(m_username)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("username", m_username));
    }

    if (!Const.isEmpty(m_password)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("password",
              Encr.encryptPasswordIfNotUsingVariables(m_password)));
    }

    if (!Const.isEmpty(m_cassandraKeyspace)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("cassandra_keyspace", m_cassandraKeyspace));
    }

    retval.append("\n ").append(
        XMLHandler.addTagValue("use_compression", m_useCompression));

    if (!Const.isEmpty(m_cqlSelectQuery)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("cql_select_query", m_cqlSelectQuery));
    }

    retval.append("\n ").append(
        XMLHandler.addTagValue("output_key_value_timestamp_tuples",
            m_outputKeyValueTimestampTuples));

    retval.append("\n ").append(
        XMLHandler.addTagValue("use_thrift_io", m_useThriftIO));

    if (!Const.isEmpty(m_socketTimeout)) {
      retval.append("\n ").append(
          XMLHandler.addTagValue("socket_timeout", m_socketTimeout));
    }

    return retval.toString();
  }
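  // getXML() above serializes the step settings as a flat list of tags which
  // loadXML() below reads back. An illustrative fragment (values are
  // hypothetical):
  //
  //   <cassandra_host>localhost</cassandra_host>
  //   <cassandra_port>9160</cassandra_port>
  //   <cassandra_keyspace>mykeyspace</cassandra_keyspace>
  //   <use_compression>N</use_compression>
  //   <cql_select_query>SELECT 'col1' FROM users;</cql_select_query>
  //   <output_key_value_timestamp_tuples>N</output_key_value_timestamp_tuples>
  //   <use_thrift_io>N</use_thrift_io>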
  public void loadXML(Node stepnode, List<DatabaseMeta> databases,
      Map<String, Counter> counters) throws KettleXMLException {
    m_cassandraHost = XMLHandler.getTagValue(stepnode, "cassandra_host");
    m_cassandraPort = XMLHandler.getTagValue(stepnode, "cassandra_port");
    m_username = XMLHandler.getTagValue(stepnode, "username");
    m_password = XMLHandler.getTagValue(stepnode, "password");
    if (!Const.isEmpty(m_password)) {
      m_password = Encr.decryptPasswordOptionallyEncrypted(m_password);
    }
    m_cassandraKeyspace = XMLHandler
        .getTagValue(stepnode, "cassandra_keyspace");
    m_cqlSelectQuery = XMLHandler.getTagValue(stepnode, "cql_select_query");
    m_useCompression = XMLHandler.getTagValue(stepnode, "use_compression")
        .equalsIgnoreCase("Y");

    String kV = XMLHandler.getTagValue(stepnode,
        "output_key_value_timestamp_tuples");
    if (kV != null) {
      m_outputKeyValueTimestampTuples = kV.equalsIgnoreCase("Y");
    }

    String thrift = XMLHandler.getTagValue(stepnode, "use_thrift_io");
    if (thrift != null) {
      m_useThriftIO = thrift.equalsIgnoreCase("Y");
    }

    m_socketTimeout = XMLHandler.getTagValue(stepnode, "socket_timeout");
  }

  public void readRep(Repository rep, ObjectId id_step,
      List<DatabaseMeta> databases, Map<String, Counter> counters)
      throws KettleException {
    m_cassandraHost = rep.getStepAttributeString(id_step, 0, "cassandra_host");
    m_cassandraPort = rep.getStepAttributeString(id_step, 0, "cassandra_port");
    m_username = rep.getStepAttributeString(id_step, 0, "username");
    m_password = rep.getStepAttributeString(id_step, 0, "password");
    if (!Const.isEmpty(m_password)) {
      m_password = Encr.decryptPasswordOptionallyEncrypted(m_password);
    }
    m_cassandraKeyspace = rep.getStepAttributeString(id_step, 0,
        "cassandra_keyspace");
    m_cqlSelectQuery = rep.getStepAttributeString(id_step, 0,
        "cql_select_query");
    m_useCompression = rep.getStepAttributeBoolean(id_step, 0,
        "use_compression");
    m_outputKeyValueTimestampTuples = rep.getStepAttributeBoolean(id_step, 0,
        "output_key_value_timestamp_tuples");
    m_useThriftIO = rep.getStepAttributeBoolean(id_step, 0, "use_thrift_io");

    m_socketTimeout = rep.getStepAttributeString(id_step, 0, "socket_timeout");
  }

  public void saveRep(Repository rep, ObjectId id_transformation,
      ObjectId id_step) throws KettleException {
    if (!Const.isEmpty(m_cassandraHost)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "cassandra_host",
          m_cassandraHost);
    }

    if (!Const.isEmpty(m_cassandraPort)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "cassandra_port",
          m_cassandraPort);
    }

    if (!Const.isEmpty(m_username)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "username",
          m_username);
    }

    if (!Const.isEmpty(m_password)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "password",
          Encr.encryptPasswordIfNotUsingVariables(m_password));
    }

    if (!Const.isEmpty(m_cassandraKeyspace)) {
      rep.saveStepAttribute(id_transformation, id_step, 0,
          "cassandra_keyspace", m_cassandraKeyspace);
    }

    rep.saveStepAttribute(id_transformation, id_step, 0, "use_compression",
        m_useCompression);

    if (!Const.isEmpty(m_cqlSelectQuery)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "cql_select_query",
          m_cqlSelectQuery);
    }

    rep.saveStepAttribute(id_transformation, id_step, 0,
        "output_key_value_timestamp_tuples", m_outputKeyValueTimestampTuples);

    rep.saveStepAttribute(id_transformation, id_step, 0, "use_thrift_io",
        m_useThriftIO);

    if (!Const.isEmpty(m_socketTimeout)) {
      rep.saveStepAttribute(id_transformation, id_step, 0, "socket_timeout",
          m_socketTimeout);
    }
  }

  public void check(List<CheckResultInterface> remarks, TransMeta transMeta,
      StepMeta stepMeta, RowMetaInterface prev, String[] input,
      String[] output, RowMetaInterface info) {
    // TODO Auto-generated method stub
  }

  public StepInterface getStep(StepMeta stepMeta,
      StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
      Trans trans) {
    return new CassandraInput(stepMeta, stepDataInterface, copyNr, transMeta,
        trans);
  }

  public StepDataInterface getStepData() {
    return new CassandraInputData();
  }

  public void setDefault() {
    m_cassandraHost = "localhost";
    m_cassandraPort = "9160";
    m_cqlSelectQuery = "SELECT <fields> FROM <column family> WHERE <condition>;";
    m_useCompression = false;
    m_socketTimeout = "";
  }
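  // getFields() below derives the output row layout by lightly parsing the
  // (variable-substituted) CQL and then asking Cassandra for column metadata.
  // A worked example with a hypothetical query:
  //
  //   SELECT FIRST 20 'name', 'age' FROM users LIMIT 500;
  //
  // would set m_colLimit = 20, m_rowLimit = 500, the column family to "users"
  // and m_specificCols to [name, age]; a "SELECT * ..." query instead pulls
  // every column defined in the column family's schema.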
  @Override
  public void getFields(RowMetaInterface rowMeta, String origin,
      RowMetaInterface[] info, StepMeta nextStep, VariableSpace space)
      throws KettleStepException {

    m_specificCols = null;
    m_rowLimit = -1;
    m_colLimit = -1;

    rowMeta.clear(); // start afresh - eats the input

    if (Const.isEmpty(m_cassandraKeyspace)) {
      // no keyspace!
      return;
    }

    String colFamName = null;
    if (!Const.isEmpty(m_cqlSelectQuery)) {
      String subQ = space.environmentSubstitute(m_cqlSelectQuery);

      if (!subQ.toLowerCase().startsWith("select")) {
        // not a select statement!
        logError(BaseMessages.getString(PKG,
            "CassandraInput.Error.NoSelectInQuery"));
        return;
      }

      if (subQ.indexOf(';') < 0) {
        // query must end with a ';' or it will wait for more!
        logError(BaseMessages.getString(PKG,
            "CassandraInput.Error.QueryTermination"));
        return;
      }

      // is there a LIMIT clause?
      if (subQ.toLowerCase().indexOf("limit") > 0) {
        String limitS = subQ.toLowerCase()
            .substring(subQ.toLowerCase().indexOf("limit") + 5, subQ.length())
            .trim();
        limitS = limitS.replaceAll(";", "");
        try {
          m_rowLimit = Integer.parseInt(limitS);
        } catch (NumberFormatException ex) {
          logError(BaseMessages.getString(PKG,
              "CassandraInput.Error.UnableToParseLimitClause",
              m_cqlSelectQuery));
          m_rowLimit = 10000;
        }
      }

      // strip off where clause (if any)
      if (subQ.toLowerCase().lastIndexOf("where") > 0) {
        subQ = subQ.substring(0, subQ.toLowerCase().lastIndexOf("where"));
      }

      // first determine the source column family
      // look for a FROM that is surrounded by space
      int fromIndex = subQ.toLowerCase().indexOf("from");
      String tempS = subQ.toLowerCase();
      int offset = fromIndex;
      while (fromIndex > 0 && tempS.charAt(fromIndex - 1) != ' '
          && (fromIndex + 4 < tempS.length())
          && tempS.charAt(fromIndex + 4) != ' ') {
        tempS = tempS.substring(fromIndex + 4, tempS.length());
        fromIndex = tempS.indexOf("from");
        offset += (4 + fromIndex);
      }

      fromIndex = offset;

      if (fromIndex < 0) {
        logError(BaseMessages.getString(PKG,
            "CassandraInput.Error.MustSpecifyAColumnFamily"));
        return; // no from clause
      }

      colFamName = subQ.substring(fromIndex + 4, subQ.length()).trim();
      if (colFamName.indexOf(' ') > 0) {
        colFamName = colFamName.substring(0, colFamName.indexOf(' '));
      } else {
        colFamName = colFamName.replace(";", "");
      }

      if (colFamName.length() == 0) {
        return; // no column family specified
      }
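      // FIRST N (old-style CQL) limits the number of columns returned per
      // row; the value parsed below is kept in m_colLimit for use by the
      // thrift-based tuple mode (see m_colLimit above).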
if (subQ.toLowerCase().indexOf("first") > 0) { String firstS = subQ.toLowerCase() .substring(subQ.toLowerCase().indexOf("first") + 5, subQ.length()) .trim(); firstS = firstS.substring(0, firstS.indexOf(' ')); try { m_colLimit = Integer.parseInt(firstS); } catch (NumberFormatException ex) { logError(BaseMessages .getString(PKG, "CassandraInput.Error.UnableToParseFirstClause", m_cqlSelectQuery)); return; } } // now determine if its a select */FIRST or specific set of columns String[] cols = null; if (subQ.indexOf("*") > 0) { // nothing special to do here m_isSelectStarQuery = true; } else { m_isSelectStarQuery = false; String colsS = subQ.substring(subQ.indexOf('\''), fromIndex); cols = colsS.split(","); } // try and connect to get meta data String hostS = space.environmentSubstitute(m_cassandraHost); String portS = space.environmentSubstitute(m_cassandraPort); String userS = m_username; String passS = m_password; if (!Const.isEmpty(userS) && !Const.isEmpty(passS)) { userS = space.environmentSubstitute(m_username); passS = space.environmentSubstitute(m_password); } String keyspaceS = space.environmentSubstitute(m_cassandraKeyspace); CassandraConnection conn = null; try { conn = CassandraInputData.getCassandraConnection(hostS, Integer.parseInt(portS), userS, passS); conn.setKeyspace(keyspaceS); } catch (Exception ex) { logError(ex.getMessage(), ex); return; } try { CassandraColumnMetaData colMeta = new CassandraColumnMetaData(conn, colFamName); // Do the key first ValueMetaInterface km = colMeta.getValueMetaForKey(); rowMeta.addValueMeta(km); if (getOutputKeyValueTimestampTuples()) { // special case where user has asked for all row keys, columns and // timestamps output as separate rows. ValueMetaInterface vm = new ValueMeta("ColumnName", ValueMetaInterface.TYPE_STRING); rowMeta.addValueMeta(vm); vm = null; String defaultColumnValidator = colMeta.getDefaultValidationClass(); if (!Const.isEmpty(defaultColumnValidator)) { if (defaultColumnValidator.indexOf('(') > 0) { defaultColumnValidator = defaultColumnValidator.substring(0, defaultColumnValidator.indexOf(')')); } if (defaultColumnValidator.endsWith("BytesType")) { vm = new ValueMeta("ColumnValue", ValueMeta.TYPE_BINARY); } } if (vm == null) { vm = new ValueMeta("ColumnValue", ValueMetaInterface.TYPE_STRING); } rowMeta.addValueMeta(vm); vm = new ValueMeta("Timestamp", ValueMetaInterface.TYPE_INTEGER); rowMeta.addValueMeta(vm); conn.close(); // specific columns requested if (cols != null) { m_specificCols = new ArrayList<String>(); for (String col : cols) { col = cleanseColName(col); m_specificCols.add(col); } } return; } if (cols == null) { // select * - use all the columns that are defined in the schema List<ValueMetaInterface> vms = colMeta.getValueMetasForSchema(); for (ValueMetaInterface vm : vms) { rowMeta.addValueMeta(vm); } } else { m_specificCols = new ArrayList<String>(); // do the individual columns for (String col : cols) { col = cleanseColName(col); if (!colMeta.columnExistsInSchema(col)) { // this one isn't known about in about in the schema - we can // output it // as long as its values satisfy the default validator... 
              logBasic(BaseMessages.getString(PKG,
                  "CassandraInput.Info.DefaultColumnValidator", col));
            }
            ValueMetaInterface vm = colMeta.getValueMetaForColumn(col);
            rowMeta.addValueMeta(vm);
          }
        }
      } catch (Exception ex) {
        logBasic(BaseMessages.getString(PKG,
            "CassandraInput.Info.UnableToRetrieveColumnMetaData", colFamName),
            ex);
        return;
      } finally {
        if (conn != null) {
          conn.close();
        }
      }
    }
  }

  private String cleanseColName(String col) {
    col = col.trim();
    col = col.replace("'", "");
    col = col.replace("\"", "");

    return col;
  }

  /**
   * Get the UI for this step.
   *
   * @param shell a <code>Shell</code> value
   * @param meta a <code>StepMetaInterface</code> value
   * @param transMeta a <code>TransMeta</code> value
   * @param name a <code>String</code> value
   * @return a <code>StepDialogInterface</code> value
   */
  public StepDialogInterface getDialog(Shell shell, StepMetaInterface meta,
      TransMeta transMeta, String name) {

    return new CassandraInputDialog(shell, meta, transMeta, name);
  }
}