* Licensed to the Ted Dunning under one or more contributor license
* agreements. See the NOTICE file that may be
* distributed with this work for additional information
* regarding copyright ownership. Ted Dunning licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package com.mapr.synth.samplers;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.mahout.math.random.Sampler;
import java.io.File;
import java.io.IOException;
import java.util.*;
* Samples from a specified schema to generate reasonably interesting data.
public class SchemaSampler implements Sampler<JsonNode> {
// Factory used by sample() to create the empty object node for each new record;
// obtained with exact BigDecimal handling switched off.
private final JsonNodeFactory nodeFactory = JsonNodeFactory.withExactBigDecimals(false);
// The field samplers that make up the schema, assigned in init().
private List<FieldSampler> schema;
// Field names, built in init() as a Lists.transform view over schema that
// maps each sampler to its getName(); positions line up with schema.
private List<String> fields;
// Records that have been generated but not yet returned. sample() polls this
// queue first and hands it to crossProduct() to be filled.
private Queue<JsonNode> buffer = new ArrayDeque<>();
/**
 * Creates a sampler from an already constructed list of field samplers.
 *
 * The list is passed to {@code init}, exactly as the String and File
 * constructors do with the list they parse, so that {@code schema} and
 * {@code fields} are populated before {@code sample()} is first called.
 *
 * @param s the field samplers that define the schema
 */
public SchemaSampler(List<FieldSampler> s) {
    init(s);
}
/**
 * Creates a sampler from a schema given as JSON text.
 *
 * The text is parsed into a list of field samplers with a mapper that
 * accepts comments, unquoted field names and single-quoted strings, and
 * the resulting list is passed to {@code init}.
 *
 * @param schemaDefinition JSON text describing a list of field samplers
 * @throws IOException if the mapper fails to read the definition
 */
public SchemaSampler(String schemaDefinition) throws IOException {
    init(lenientMapper().<List<FieldSampler>>readValue(schemaDefinition, new TypeReference<List<FieldSampler>>() {
    }));
}

/**
 * Creates a sampler from a schema stored in a file.
 *
 * The file is parsed with the same relaxed JSON settings as the String
 * constructor, and the resulting list is passed to {@code init}.
 *
 * @param input file containing the JSON description of a list of field samplers
 * @throws IOException if the mapper fails to read the file
 */
public SchemaSampler(File input) throws IOException {
    init(lenientMapper().<List<FieldSampler>>readValue(input, new TypeReference<List<FieldSampler>>() {
    }));
}

/**
 * Builds the mapper shared by the parsing constructors so that both accept
 * exactly the same relaxed JSON syntax.
 *
 * @return a new mapper with comments, unquoted field names and single
 *         quotes enabled
 */
private static ObjectMapper lenientMapper() {
    ObjectMapper mapper = new ObjectMapper();
    mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
    mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true);
    mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true);
    return mapper;
}
public List<String> getFieldNames() {
return fields;
private void init(List<FieldSampler> s) {
schema = s;
fields = Lists.transform(schema, new Function<FieldSampler, String>() {
public String apply(FieldSampler input) {
return input.getName();
public JsonNode sample() {
// we may have buffered records
JsonNode x = buffer.poll();
while (x == null) {
// nothing buffered ... generate some data
Map<String, JsonNode> generators = Maps.newTreeMap();
ObjectNode r = nodeFactory.objectNode();
Iterator<String> fx = fields.iterator();
for (FieldSampler s : schema) {
String fieldName = fx.next();
if (s.isFlat()) {
// this sampler either generates an object or an array
JsonNode v = s.sample();
if (v.isObject()) {
// an object just produces multiple fields in a single record
for (Iterator<String> it = v.fieldNames(); it.hasNext(); ) {
String key = it.next();
r.set(key, v.get(key));
} else if (v.isArray()) {
// an array causes records to be buffered
generators.put(fieldName, v);
} else {
r.set(fieldName, v);
} else {
r.set(fieldName, s.sample());
// at this point r has all non generator fields
if (generators.size() > 0) {
// here we have to handle the case of more than one generator
crossProduct(buffer, r, Lists.newArrayList(generators.keySet()), generators, 0);
// the generators may or may not have actually generated anything
// but that will just cause us to go once more around the circle
x = buffer.poll();
} else {
// with no array generators, we can short-circuit the process
x = r;
// yes, there was a buffered record
return x;
private void crossProduct(Queue<JsonNode> buffer, ObjectNode r, List<String> fields, Map<String, JsonNode> generators, int currentFieldIndex) {
if (currentFieldIndex < fields.size()) {
// get this generator
JsonNode values = generators.get(fields.get(currentFieldIndex));
int n = values.size();
// and for each value it has
for (int j = 0; j < n; j++) {
// set that field ...
r.set(fields.get(currentFieldIndex), values.get(j));
// and recurse
crossProduct(buffer, r, fields, generators, currentFieldIndex + 1);
} else {
// when we bottom out we add a copied record to the buffer