/* * Licensed to the Ted Dunning under one or more contributor license * agreements. See the NOTICE file that may be * distributed with this work for additional information * regarding copyright ownership. Ted Dunning licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.mapr.synth; import com.google.common.collect.Queues; import com.mapr.synth.distributions.LongTail; import com.mapr.synth.distributions.TermGenerator; import com.mapr.synth.distributions.WordGenerator; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; import org.apache.mahout.math.jet.random.Uniform; import org.apache.mahout.math.random.Sampler; import java.net.Inet4Address; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.PriorityQueue; import java.util.Random; /** * Generates kind of realistic log lines consisting of a user id (a cookie), an IP address and a query. */ public class LogGenerator implements Sampler<LogLine> { private PriorityQueue<LogLine> eventBuffer = Queues.newPriorityQueue(); private PriorityQueue<User> users = Queues.newPriorityQueue(); private LongTail<InetAddress> ipGenerator = new LongTail<InetAddress>(1, 0.5) { Random gen = new Random(); @Override protected InetAddress createThing() { int address = gen.nextInt(); try { return Inet4Address.getByAddress(new byte[]{ (byte) (address >>> 24), (byte) (0xff & (address >>> 16)), (byte) (0xff & (address >>> 8)), (byte) (0xff & (address)) }); } catch (UnknownHostException e) { throw new RuntimeException("Can't happen with numeric IP address", e); } } }; private WordGenerator words = new WordGenerator("word-frequency-seed", "other-words"); private TermGenerator terms = new TermGenerator(words, 1, 0.8); private TermGenerator geo = new TermGenerator(new WordGenerator(null, "geo-codes"), 10, 0); // the average user visits once per day, but there is a LOT of variation between users private AbstractContinousDistribution sessionRateDistribution = new Uniform(1.0 / 24 / 3600, 1.0 / 24 / 3600, RandomUtils.getRandom()); public Iterable<User> getUsers() { return users; } public LogGenerator(int userCount) { for (int i = 0; i < userCount; i++) { users.add(new User(ipGenerator.sample(), geo.sample(), terms, sessionRateDistribution.nextDouble())); } } public LogLine sample() { LogLine firstEvent = eventBuffer.peek(); double t1 = firstEvent != null ? firstEvent.getT() : Double.POSITIVE_INFINITY; double t2 = users.peek().getNextSession(); while (t2 < t1) { User u = users.poll(); // generate a session u.session(eventBuffer); // user now has new time for next session users.add(u); // if u.session() schedules an event immediately, then this will never // allow another loop firstEvent = eventBuffer.peek(); t1 = firstEvent != null ? firstEvent.getT() : Double.POSITIVE_INFINITY; t2 = users.peek().getNextSession(); } return eventBuffer.poll(); } public int getUserCount() { return users.size(); } }