/* * Licensed to the Ted Dunning under one or more contributor license * agreements. See the NOTICE file that may be * distributed with this work for additional information * regarding copyright ownership. Ted Dunning licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.mapr.synth.samplers; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.base.Function; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.mahout.math.random.Sampler; import java.io.File; import java.io.IOException; import java.util.*; /** * Samples from a specified schema to generate reasonably interesting data. */ public class SchemaSampler implements Sampler<JsonNode> { private final JsonNodeFactory nodeFactory = JsonNodeFactory.withExactBigDecimals(false); private List<FieldSampler> schema; private List<String> fields; private Queue<JsonNode> buffer = new ArrayDeque<>(); public SchemaSampler(List<FieldSampler> s) { init(s); } public SchemaSampler(String schemaDefinition) throws IOException { ObjectMapper mapper = new ObjectMapper(); mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true); mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true); mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); init(mapper.<List<FieldSampler>>readValue(schemaDefinition, new TypeReference<List<FieldSampler>>() { })); } public SchemaSampler(File input) throws IOException { ObjectMapper mapper = new ObjectMapper(); mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true); mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true); mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); init(mapper.<List<FieldSampler>>readValue(input, new TypeReference<List<FieldSampler>>() { })); } public List<String> getFieldNames() { return fields; } private void init(List<FieldSampler> s) { schema = s; fields = Lists.transform(schema, new Function<FieldSampler, String>() { @Override public String apply(FieldSampler input) { return input.getName(); } }); } @Override public JsonNode sample() { // we may have buffered records JsonNode x = buffer.poll(); while (x == null) { // nothing buffered ... generate some data Map<String, JsonNode> generators = Maps.newTreeMap(); ObjectNode r = nodeFactory.objectNode(); Iterator<String> fx = fields.iterator(); for (FieldSampler s : schema) { String fieldName = fx.next(); if (s.isFlat()) { // this sampler either generates an object or an array JsonNode v = s.sample(); if (v.isObject()) { // an object just produces multiple fields in a single record for (Iterator<String> it = v.fieldNames(); it.hasNext(); ) { String key = it.next(); r.set(key, v.get(key)); } } else if (v.isArray()) { // an array causes records to be buffered generators.put(fieldName, v); } else { r.set(fieldName, v); } } else { r.set(fieldName, s.sample()); } } // at this point r has all non generator fields if (generators.size() > 0) { // here we have to handle the case of more than one generator crossProduct(buffer, r, Lists.newArrayList(generators.keySet()), generators, 0); // the generators may or may not have actually generated anything // but that will just cause us to go once more around the circle x = buffer.poll(); } else { // with no array generators, we can short-circuit the process x = r; } } // yes, there was a buffered record return x; } private void crossProduct(Queue<JsonNode> buffer, ObjectNode r, List<String> fields, Map<String, JsonNode> generators, int currentFieldIndex) { if (currentFieldIndex < fields.size()) { // get this generator JsonNode values = generators.get(fields.get(currentFieldIndex)); int n = values.size(); // and for each value it has for (int j = 0; j < n; j++) { // set that field ... r.set(fields.get(currentFieldIndex), values.get(j)); // and recurse crossProduct(buffer, r, fields, generators, currentFieldIndex + 1); } } else { // when we bottom out we add a copied record to the buffer buffer.add(r.deepCopy()); } } }