/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.hashing;

import com.google.common.hash.Funnel;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.hash.PrimitiveSink;
import com.streamsets.pipeline.api.Field;
import com.streamsets.pipeline.api.Record;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.lib.util.FieldRegexUtil;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;

/**
 * Hashing helpers built on Guava's hashing library. This code was refactored out so that it can
 * be shared; it is currently used by FieldHasher and DedupProcessor.
 */
public final class HashingUtil {

  /** Hash algorithms that {@link #getHasher(HashType)} can resolve. */
  public enum HashType {
    MURMUR3_128,
    MURMUR3_32,
    SIPHASH24,
    MD5,
    SHA1,
    SHA256,
    SHA512,
    ADLER32,
    CRC32,
    CRC32C,
    ;
  }

  /**
   * Resolves a {@link HashType} to the matching Guava {@link HashFunction}.
   *
   * @param hashType the algorithm to look up; switching on {@code null} throws a
   *     {@link NullPointerException}
   * @return the Guava hash function for {@code hashType}
   * @throws IllegalArgumentException if {@code hashType} has no mapping in this method
   */
  public static HashFunction getHasher(HashType hashType) {
    switch (hashType) {
      case MURMUR3_128:
        return Hashing.murmur3_128();
      case MURMUR3_32:
        return Hashing.murmur3_32();
      case SIPHASH24:
        return Hashing.sipHash24();
      case MD5:
        return Hashing.md5();
      case SHA1:
        return Hashing.sha1();
      case SHA256:
        return Hashing.sha256();
      case SHA512:
        return Hashing.sha512();
      case ADLER32:
        return Hashing.adler32();
      case CRC32:
        return Hashing.crc32();
      case CRC32C:
        return Hashing.crc32c();
      default:
        throw new IllegalArgumentException(
            Utils.format("Unsupported Hashing Algorithm: {}", hashType.name())
        );
    }
  }

  /**
   * Creates a funnel that hashes the given fields and ignores the record header.
   *
   * @param fieldsToHash field path expressions to hash; {@code null} or empty means every field
   *     of the record
   * @return a new {@link RecordFunnel}
   */
  public static RecordFunnel getRecordFunnel(Collection<String> fieldsToHash) {
    return getRecordFunnel(fieldsToHash, false);
  }

  /**
   * Creates a funnel that hashes the given fields and, optionally, the record header attributes.
   *
   * @param fieldsToHash field path expressions to hash; {@code null} or empty means every field
   *     of the record
   * @param includeRecordHeader whether header attribute values are fed into the hash as well
   * @return a new {@link RecordFunnel}
   */
  public static RecordFunnel getRecordFunnel(
      Collection<String> fieldsToHash,
      boolean includeRecordHeader
  ) {
    if (fieldsToHash == null || fieldsToHash.isEmpty()) {
      // A null field list makes the funnel hash every field of the record. The header flag has
      // to be forwarded here too: the no-arg RecordFunnel constructor would silently reset it
      // to false, dropping the header from "all fields + header" hashes.
      return new RecordFunnel(null, includeRecordHeader);
    }
    return new RecordFunnel(fieldsToHash, includeRecordHeader);
  }

  /**
   * Guava {@link Funnel} that feeds selected fields of a {@link Record}, and optionally its
   * header attribute values, into a {@link PrimitiveSink}.
   */
  public static class RecordFunnel implements Funnel<Record> {
    /** Field path expressions to hash; {@code null} means every field of the record. */
    private final Collection<String> fieldsToHash;
    /** Whether header attribute values are appended after the fields. */
    private final boolean includeRecordHeader;

    /** Creates a funnel that hashes every field of a record and ignores the header. */
    public RecordFunnel() {
      this(null, false);
    }

    /**
     * Creates a funnel for the given field selection.
     *
     * @param fieldsToHash field path expressions to hash; {@code null} means every field of the
     *     record, while an empty collection selects no fields at all
     * @param includeRecordHeader whether header attribute values are fed into the hash as well
     */
    public RecordFunnel(Collection<String> fieldsToHash, boolean includeRecordHeader) {
      this.fieldsToHash = fieldsToHash;
      this.includeRecordHeader = includeRecordHeader;
    }

    /**
     * Resolves the concrete field paths of {@code record} that will be hashed, in hashing order.
     *
     * <p>With no configured field list, all escaped field paths of the record are returned in
     * sorted order. Otherwise each configured expression is matched against the record's field
     * paths; matches are sorted per expression and appended in the order the expressions were
     * configured.
     *
     * @param record the record whose field paths are inspected
     * @return the ordered list of field paths to hash
     */
    protected List<String> getFieldsToHash(Record record) {
      Set<String> fieldPaths = record.getEscapedFieldPaths();
      if (fieldsToHash == null) {
        List<String> allFields = new ArrayList<>(fieldPaths);
        Collections.sort(allFields);
        return allFields;
      }

      List<String> fields = new ArrayList<>();
      for (String field : fieldsToHash) {
        List<String> matchingFieldPaths = FieldRegexUtil.getMatchingFieldPaths(field, fieldPaths);
        Collections.sort(matchingFieldPaths);
        fields.addAll(matchingFieldPaths);
      }
      return fields;
    }

    /**
     * Feeds the selected fields of {@code record}, followed by the header attribute values when
     * enabled, into {@code sink}. Every field and every header attribute is followed by a single
     * zero byte.
     *
     * @param record the record to hash
     * @param sink the sink receiving the primitive values
     * @throws IllegalArgumentException if a resolved field path is not present in the record
     * @throws IllegalStateException if a non-null field to hash is of type {@code FILE_REF}
     */
    @Override
    public void funnel(Record record, PrimitiveSink sink) {
      for (String path : getFieldsToHash(record)) {
        Field field = record.get(path);
        if (field == null) {
          throw new IllegalArgumentException(
              Utils.format("Field Path {} does not exist in the record", path)
          );
        }
        if (field.getValue() != null) {
          putFieldValue(path, field, sink);
        } else {
          // A null value is represented by a boolean marker instead of being skipped.
          sink.putBoolean(true);
        }
        sink.putByte((byte) 0);
      }

      if (this.includeRecordHeader) {
        // Only attribute values are hashed, not their names, and they are visited in whatever
        // order getAttributeNames() yields them.
        // NOTE(review): if that order is not stable, equal headers could hash differently - confirm.
        for (String attrName : record.getHeader().getAttributeNames()) {
          String headerAttr = record.getHeader().getAttribute(attrName);
          if (headerAttr != null) {
            sink.putString(headerAttr, Charset.defaultCharset());
          } else {
            sink.putBoolean(true);
          }
          sink.putByte((byte) 0);
        }
      }
    }

    /**
     * Writes the non-null value of {@code field} to {@code sink} using the sink method matching
     * the field type. Types without a case below contribute nothing here.
     */
    private static void putFieldValue(String path, Field field, PrimitiveSink sink) {
      switch (field.getType()) {
        case BOOLEAN:
          sink.putBoolean(field.getValueAsBoolean());
          break;
        case CHAR:
          sink.putChar(field.getValueAsChar());
          break;
        case BYTE:
          sink.putByte(field.getValueAsByte());
          break;
        case SHORT:
          sink.putShort(field.getValueAsShort());
          break;
        case INTEGER:
          sink.putInt(field.getValueAsInteger());
          break;
        case LONG:
          sink.putLong(field.getValueAsLong());
          break;
        case FLOAT:
          sink.putFloat(field.getValueAsFloat());
          break;
        case DOUBLE:
          sink.putDouble(field.getValueAsDouble());
          break;
        case DATE:
          sink.putLong(field.getValueAsDate().getTime());
          break;
        case TIME:
          sink.putLong(field.getValueAsTime().getTime());
          break;
        case DATETIME:
          sink.putLong(field.getValueAsDatetime().getTime());
          break;
        case DECIMAL:
        case STRING:
          // Text is encoded with the JVM default charset, so hashes of non-ASCII text can differ
          // between JVMs whose default charsets differ. Left unchanged on purpose: switching the
          // charset would alter hash values that have already been produced.
          sink.putString(field.getValueAsString(), Charset.defaultCharset());
          break;
        case BYTE_ARRAY:
          sink.putBytes(field.getValueAsByteArray());
          break;
        case FILE_REF:
          throw new IllegalStateException(
              Utils.format(
                  "Hashing not supported for field: {} of type {}",
                  path,
                  field.getType()
              )
          );
        default:
          break;
      }
    }
  }
}