KafkaMusicExample.java example

Explorer

examples-master
- kafka-clients
  - consumer
    - src
      - main
        java
        io
        confluent
        examples
        consumer
        ConsumerGroupExample.java
        ConsumerLogic.java
        SimpleConsumerExample.java
  - producer
    - src
      - main
        java
        io
        confluent
        examples
        producer
        ProducerExample.java
  - specific-avro-consumer
    - src
      - main
        java
        io
        confluent
        examples
        consumer
        AvroClicksSessionizer.java
        SessionState.java
  - specific-avro-producer
    - src
      - main
        java
        io
        confluent
        examples
        producer
        AvroClicksProducer.java
        EventGenerator.java
- kafka-streams
  - src
    - main
      - java
        io
        confluent
        examples
        streams
        AnomalyDetectionLambdaExample.java
        ApplicationResetExample.java
        GlobalKTablesExample.java
        GlobalKTablesExampleDriver.java
        MapFunctionLambdaExample.java
        PageViewRegionExample.java
        PageViewRegionExampleDriver.java
        PageViewRegionLambdaExample.java
        SecureKafkaStreamsExample.java
        SessionWindowsExample.java
        SessionWindowsExampleDriver.java
        SumLambdaExample.java
        SumLambdaExampleDriver.java
        TopArticlesExampleDriver.java
        TopArticlesLambdaExample.java
        UserRegionLambdaExample.java
        WikipediaFeedAvroExample.java
        WikipediaFeedAvroExampleDriver.java
        WikipediaFeedAvroLambdaExample.java
        WordCountLambdaExample.java
        interactivequeries
        HostStoreInfo.java
        KeyValueBean.java
        MetadataService.java
        WordCountInteractiveQueriesDriver.java
        WordCountInteractiveQueriesExample.java
        WordCountInteractiveQueriesRestService.java
        kafkamusic
        KafkaMusicExample.java
        KafkaMusicExampleDriver.java
        MusicPlaysRestService.java
        SongBean.java
        SongPlayCountBean.java
        utils
        PriorityQueueDeserializer.java
        PriorityQueueSerde.java
        PriorityQueueSerializer.java
        WindowedSerde.java
        WindowedStringDeserializer.java
    - test
      - java
        io
        confluent
        examples
        streams
        ApplicationResetIntegrationTest.java
        EventDeduplicationLambdaIntegrationTest.java
        FanoutLambdaIntegrationTest.java
        GenericAvroIntegrationTest.java
        GlobalKTablesExampleTest.java
        HandlingCorruptedInputRecordsIntegrationTest.java
        IntegrationTestUtils.java
        JoinLambdaIntegrationTest.java
        MapFunctionLambdaIntegrationTest.java
        MixAndMatchLambdaIntegrationTest.java
        PassThroughIntegrationTest.java
        SessionWindowsExampleTest.java
        SpecificAvroIntegrationTest.java
        StateStoresInTheDSLIntegrationTest.java
        StreamToStreamJoinIntegrationTest.java
        StreamToTableJoinIntegrationTest.java
        SumLambdaIntegrationTest.java
        TableToTableJoinIntegrationTest.java
        TopArticlesLambdaExampleTest.java
        UserCountsPerRegionLambdaIntegrationTest.java
        ValidateStateWithInteractiveQueriesLambdaIntegrationTest.java
        WikipediaFeedAvroExampleTest.java
        WikipediaFeedAvroLambdaExampleTest.java
        WordCountLambdaIntegrationTest.java
        interactivequeries
        WordCountInteractiveQueriesExampleTest.java
        kafkamusic
        KafkaMusicExampleTest.java
        kafka
        EmbeddedSingleNodeKafkaCluster.java
        KafkaEmbedded.java
        utils
        PriorityQueueSerdeTest.java
        zookeeper
        ZooKeeperEmbedded.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.confluent.examples.streams.interactivequeries.kafkamusic;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.Serde;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.state.HostInfo;
import org.apache.kafka.streams.state.RocksDBConfigSetter;
import org.rocksdb.Options;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;

import io.confluent.examples.streams.avro.PlayEvent;
import io.confluent.examples.streams.avro.Song;
import io.confluent.examples.streams.avro.SongPlayCount;
import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig;
import io.confluent.kafka.streams.serdes.avro.SpecificAvroSerde;

/**
 * Demonstrates how to locate and query state stores (Interactive Queries).
 *
 * This application continuously computes the latest Top 5 music charts based on song play events
 * collected in real-time in a Kafka topic. This charts data is maintained in a continuously updated
 * state store that can be queried interactively via a REST API.
 *
 * Note: This example uses Java 8 functionality and thus works with Java 8+ only.  But of course you
 * can use the Interactive Queries feature of Kafka Streams also with Java 7.
 *
 * The topology in this example is modelled on a (very) simple streaming music service. It has 2
 * input topics: song-feed and play-events.
 *
 * The song-feed topic contains all of the songs available in the streaming service and is read
 * as a KTable with all songs being stored in the all-songs state store.
 *
 * The play-events topic is a feed of song plays. We filter the play events to only accept events
 * where the duration is >= 30 seconds. We then map the stream so that it is keyed by songId.
 *
 * Now that both streams are keyed the same we can join the play events with the songs, group by
 * the song and count them into a KTable, songPlayCounts, and a state store, song-play-count,
 * to keep track of the number of times each song has been played.
 *
 * Next, we group the songPlayCounts KTable by genre and aggregate into another KTable with the
 * state store, top-five-songs-by-genre, to track the top five songs by genre. Subsequently, we
 * group the same songPlayCounts KTable such that all song plays end up in the same partition. We
 * use this to aggregate the overall top five songs played into the state store, top-five.
 *
 * HOW TO RUN THIS EXAMPLE
 *
 * 1) Start Zookeeper, Kafka, and Confluent Schema Registry. Please refer to <a href='http://docs.confluent.io/current/quickstart.html#quickstart'>QuickStart</a>.
 *
 * 2) Create the input and output topics used by this example.
 *
 * <pre>
 * {@code
 * $ bin/kafka-topics --create --topic play-events \
 *                    --zookeeper localhost:2181 --partitions 4 --replication-factor 1
 * $ bin/kafka-topics --create --topic song-feed \
 *                    --zookeeper localhost:2181 --partitions 4 --replication-factor 1
 *
 * }
 * </pre>
 *
 * Note: The above commands are for the Confluent Platform. For Apache Kafka it should be
 * `bin/kafka-topics.sh ...`.
 *
 *
 * 3) Start two instances of this example application either in your IDE or on the command
 * line.
 *
 * If via the command line please refer to <a href='https://github.com/confluentinc/examples/tree/master/kafka-streams#packaging-and-running'>Packaging</a>.
 *
 * Once packaged you can then start the first instance of the application (on port 7070):
 *
 * <pre>
 * {@code
 * $ java -cp target/streams-examples-3.3.0-SNAPSHOT-standalone.jar \
 *      io.confluent.examples.streams.interactivequeries.kafkamusic.KafkaMusicExample 7070
 * }
 * </pre>
 *
 * Here, `7070` sets the port for the REST endpoint that will be used by this application instance.
 *
 * Then, in a separate terminal, run the second instance of this application (on port 7071):
 *
 * <pre>
 * {@code
 * $ java -cp target/streams-examples-3.3.0-SNAPSHOT-standalone.jar \
 *      io.confluent.examples.streams.interactivequeries.kafkamusic.KafkaMusicExample 7071
 * }
 * </pre>
 *
 *
 * 4) Write some input data to the source topics (e.g. via {@link KafkaMusicExampleDriver}). The
 * already running example application (step 3) will automatically process this input data
 *
 *
 * 5) Use your browser to hit the REST endpoint of the app instance you started in step 3 to query
 * the state managed by this application.  Note: If you are running multiple app instances, you can
 * query them arbitrarily -- if an app instance cannot satisfy a query itself, it will fetch the
 * results from the other instances.
 *
 * For example:
 *
 * <pre>
 * {@code
 * # List all running instances of this application
 * http://localhost:7070/kafka-music/instances
 *
 * # List app instances that currently manage (parts of) state store "song-play-count"
 * http://localhost:7070/kafka-music/instances/song-play-count
 *
 * # Get the latest top five for the genre "punk"
 * http://localhost:7070/kafka-music/charts/genre/punk
 *
 * # Get the latest top five across all genres
 * http://localhost:7070/kafka-music/charts/top-five
 * }
 * </pre>
 *
 * Note: that the REST functionality is NOT part of Kafka Streams or its API. For demonstration
 * purposes of this example application, we decided to go with a simple, custom-built REST API that
 * uses the Interactive Queries API of Kafka Streams behind the scenes to expose the state stores of
 * this application via REST.
 *
 * 6) Once you're done with your experiments, you can stop this example via `Ctrl-C`.  If needed,
 * also stop the Schema Registry (`Ctrl-C`), the Kafka broker (`Ctrl-C`), and only then stop the ZooKeeper instance
 * (`Ctrl-C`).
 *
 * If you like you can run multiple instances of this example by passing in a different port. You
 * can then experiment with seeing how keys map to different instances etc.
 */

public class KafkaMusicExample {

  private static final Long MIN_CHARTABLE_DURATION = 30 * 1000L;
  private static final String SONG_PLAY_COUNT_STORE = "song-play-count";
  static final String PLAY_EVENTS = "play-events";
  static final String ALL_SONGS = "all-songs";
  static final String SONG_FEED = "song-feed";
  static final String TOP_FIVE_SONGS_BY_GENRE_STORE = "top-five-songs-by-genre";
  static final String TOP_FIVE_SONGS_STORE = "top-five-songs";
  static final String TOP_FIVE_KEY = "all";

  private static final String DEFAULT_REST_ENDPOINT_HOSTNAME = "localhost";
  private static final String DEFAULT_BOOTSTRAP_SERVERS = "localhost:9092";
  private static final String DEFAULT_SCHEMA_REGISTRY_URL = "http://localhost:8081";

  public static class CustomRocksDBConfig implements RocksDBConfigSetter {

    @Override
    public void setConfig(final String storeName, final Options options, final Map<String, Object> configs) {
      // Workaround: We must ensure that the parallelism is set to >= 2.  There seems to be a known
      // issue with RocksDB where explicitly setting the parallelism to 1 causes issues (even though
      // 1 seems to be RocksDB's default for this configuration).
      int compactionParallelism = Math.max(Runtime.getRuntime().availableProcessors(), 2);
      // Set number of compaction threads (but not flush threads).
      options.setIncreaseParallelism(compactionParallelism);
    }
  }

  public static void main(String[] args) throws Exception {
    if (args.length == 0 || args.length > 4) {
      throw new IllegalArgumentException("usage: ... <portForRestEndpoint> " +
          "[<bootstrap.servers> (optional, default: " + DEFAULT_BOOTSTRAP_SERVERS + ")] " +
          "[<schema.registry.url> (optional, default: " + DEFAULT_SCHEMA_REGISTRY_URL + ")] " +
          "[<hostnameForRestEndPoint> (optional, default: " + DEFAULT_REST_ENDPOINT_HOSTNAME + ")]");
    }
    final int restEndpointPort = Integer.valueOf(args[0]);
    final String bootstrapServers = args.length > 1 ? args[1] : "localhost:9092";
    final String schemaRegistryUrl = args.length > 2 ? args[2] : "http://localhost:8081";
    final String restEndpointHostname = args.length > 3 ? args[3] : DEFAULT_REST_ENDPOINT_HOSTNAME;
    final HostInfo restEndpoint = new HostInfo(restEndpointHostname, restEndpointPort);

    System.out.println("Connecting to Kafka cluster via bootstrap servers " + bootstrapServers);
    System.out.println("Connecting to Confluent schema registry at " + schemaRegistryUrl);
    System.out.println("REST endpoint at http://" + restEndpointHostname + ":" + restEndpointPort);

    final KafkaStreams streams = createChartsStreams(bootstrapServers,
                                                     schemaRegistryUrl,
                                                     restEndpointPort,
                                                     "/tmp/kafka-streams");

    // Always (and unconditionally) clean local state prior to starting the processing topology.
    // We opt for this unconditional call here because this will make it easier for you to play around with the example
    // when resetting the application for doing a re-run (via the Application Reset Tool,
    // http://docs.confluent.io/current/streams/developer-guide.html#application-reset-tool).
    //
    // The drawback of cleaning up local state prior is that your app must rebuilt its local state from scratch, which
    // will take time and will require reading all the state-relevant data from the Kafka cluster over the network.
    // Thus in a production scenario you typically do not want to clean up always as we do here but rather only when it
    // is truly needed, i.e., only under certain conditions (e.g., the presence of a command line flag for your app).
    // See `ApplicationResetExample.java` for a production-like example.
    streams.cleanUp();

    // Now that we have finished the definition of the processing topology we can actually run
    // it via `start()`.  The Streams application as a whole can be launched just like any
    // normal Java application that has a `main()` method.
    streams.start();

    // Start the Restful proxy for servicing remote access to state stores
    final MusicPlaysRestService restService = startRestProxy(streams, restEndpoint);

    // Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams
    Runtime.getRuntime().addShutdownHook(new Thread(() -> {
      try {
        restService.stop();
        streams.close();
      } catch (Exception e) {
        // ignored
      }
    }));
  }

  static MusicPlaysRestService startRestProxy(final KafkaStreams streams, final HostInfo hostInfo)
      throws Exception {
    final MusicPlaysRestService
        interactiveQueriesRestService = new MusicPlaysRestService(streams, hostInfo);
    interactiveQueriesRestService.start();
    return interactiveQueriesRestService;
  }

  static KafkaStreams createChartsStreams(final String bootstrapServers,
                                          final String schemaRegistryUrl,
                                          final int applicationServerPort,
                                          final String stateDir) {
    final Properties streamsConfiguration = new Properties();
    // Give the Streams application a unique name.  The name must be unique in the Kafka cluster
    // against which the application is run.
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka-music-charts");
    // Where to find Kafka broker(s).
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
    // Provide the details of our embedded http service that we'll use to connect to this streams
    // instance and discover locations of stores.
    streamsConfiguration.put(StreamsConfig.APPLICATION_SERVER_CONFIG, "localhost:" + applicationServerPort);
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, stateDir);
    // Set to earliest so we don't miss any data that arrived in the topics before the process
    // started
    streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
    // Set the commit interval to 500ms so that any changes are flushed frequently and the top five
    // charts are updated with low latency.
    streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 500);
    // Allow the user to fine-tune the `metadata.max.age.ms` via Java system properties from the CLI.
    // Lowering this parameter from its default of 5 minutes to a few seconds is helpful in
    // situations where the input topic was not pre-created before running the application because
    // the application will discover a newly created topic faster.  In production, you would
    // typically not change this parameter from its default.
    String metadataMaxAgeMs = System.getProperty(ConsumerConfig.METADATA_MAX_AGE_CONFIG);
    if (metadataMaxAgeMs != null) {
      try {
        int value = Integer.parseInt(metadataMaxAgeMs);
        streamsConfiguration.put(ConsumerConfig.METADATA_MAX_AGE_CONFIG, value);
        System.out.println("Set consumer configuration " + ConsumerConfig.METADATA_MAX_AGE_CONFIG +
            " to " + value);
      } catch (NumberFormatException ignored) {
      }
    }

    // create and configure the SpecificAvroSerdes required in this example
    final Map<String, String> serdeConfig = Collections.singletonMap(
        AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl);

    final SpecificAvroSerde<PlayEvent> playEventSerde = new SpecificAvroSerde<>();
    playEventSerde.configure(serdeConfig, false);

    final SpecificAvroSerde<Song> keySongSerde = new SpecificAvroSerde<>();
    keySongSerde.configure(serdeConfig, true);

    final SpecificAvroSerde<Song> valueSongSerde = new SpecificAvroSerde<>();
    valueSongSerde.configure(serdeConfig, false);

    final SpecificAvroSerde<SongPlayCount> songPlayCountSerde = new SpecificAvroSerde<>();
    songPlayCountSerde.configure(serdeConfig, false);

    final KStreamBuilder builder = new KStreamBuilder();

    // get a stream of play events
    final KStream<String, PlayEvent> playEvents = builder.stream(Serdes.String(),
        playEventSerde,
        PLAY_EVENTS);

    // get table and create a state store to hold all the songs in the store
    final KTable<Long, Song>
        songTable =
        builder.table(Serdes.Long(), valueSongSerde, SONG_FEED, ALL_SONGS);

    // Accept play events that have a duration >= the minimum
    final KStream<Long, PlayEvent> playsBySongId =
        playEvents.filter((region, event) -> event.getDuration() >= MIN_CHARTABLE_DURATION)
            // repartition based on song id
            .map((key, value) -> KeyValue.pair(value.getSongId(), value));


    // join the plays with song as we will use it later for charting
    final KStream<Long, Song> songPlays = playsBySongId.leftJoin(songTable,
        (value1, song) -> song,
        Serdes.Long(),
        playEventSerde);

    // create a state store to track song play counts
    final KTable<Song, Long> songPlayCounts = songPlays.groupBy((songId, song) -> song, keySongSerde, valueSongSerde)
        .count(SONG_PLAY_COUNT_STORE);

    final TopFiveSerde topFiveSerde = new TopFiveSerde();


    // Compute the top five charts for each genre. The results of this computation will continuously update the state
    // store "top-five-songs-by-genre", and this state store can then be queried interactively via a REST API (cf.
    // MusicPlaysRestService) for the latest charts per genre.
    songPlayCounts.groupBy((song, plays) ->
            KeyValue.pair(song.getGenre().toLowerCase(),
                new SongPlayCount(song.getId(), plays)),
        Serdes.String(),
        songPlayCountSerde)
        // aggregate into a TopFiveSongs instance that will keep track
        // of the current top five for each genre. The data will be available in the
        // top-five-songs-genre store
        .aggregate(TopFiveSongs::new,
            (aggKey, value, aggregate) -> {
              aggregate.add(value);
              return aggregate;
            },
            (aggKey, value, aggregate) -> {
              aggregate.remove(value);
              return aggregate;
            },
            topFiveSerde,
            TOP_FIVE_SONGS_BY_GENRE_STORE
        );

    // Compute the top five chart. The results of this computation will continuously update the state
    // store "top-five-songs", and this state store can then be queried interactively via a REST API (cf.
    // MusicPlaysRestService) for the latest charts per genre.
    songPlayCounts.groupBy((song, plays) ->
            KeyValue.pair(TOP_FIVE_KEY,
                new SongPlayCount(song.getId(), plays)),
        Serdes.String(),
        songPlayCountSerde)
        .aggregate(TopFiveSongs::new,
            (aggKey, value, aggregate) -> {
              aggregate.add(value);
              return aggregate;
            },
            (aggKey, value, aggregate) -> {
              aggregate.remove(value);
              return aggregate;
            },
            topFiveSerde,
            TOP_FIVE_SONGS_STORE
        );

    return new KafkaStreams(builder, streamsConfiguration);

  }

  /**
   * Serde for TopFiveSongs
   */
  private static class TopFiveSerde implements Serde<TopFiveSongs> {

    @Override
    public void configure(final Map<String, ?> map, final boolean b) {

    }

    @Override
    public void close() {

    }

    @Override
    public Serializer<TopFiveSongs> serializer() {
      return new Serializer<TopFiveSongs>() {
        @Override
        public void configure(final Map<String, ?> map, final boolean b) {
        }

        @Override
        public byte[] serialize(final String s, final TopFiveSongs topFiveSongs) {
          final ByteArrayOutputStream out = new ByteArrayOutputStream();
          final DataOutputStream
              dataOutputStream =
              new DataOutputStream(out);
          try {
            for (SongPlayCount songPlayCount : topFiveSongs) {
                dataOutputStream.writeLong(songPlayCount.getSongId());
                dataOutputStream.writeLong(songPlayCount.getPlays());
            }
            dataOutputStream.flush();
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
            return out.toByteArray();
        }

        @Override
        public void close() {

        }
      };
    }

    @Override
    public Deserializer<TopFiveSongs> deserializer() {
      return new Deserializer<TopFiveSongs>() {
        @Override
        public void configure(final Map<String, ?> map, final boolean b) {

        }

        @Override
        public TopFiveSongs deserialize(final String s, final byte[] bytes) {
          if (bytes == null || bytes.length == 0) {
            return null;
          }
          final TopFiveSongs result = new TopFiveSongs();

          final DataInputStream
              dataInputStream =
              new DataInputStream(new ByteArrayInputStream(bytes));

          try {
            while(dataInputStream.available() > 0) {
              result.add(new SongPlayCount(dataInputStream.readLong(),
                                           dataInputStream.readLong()));
            }
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
          return result;
        }

        @Override
        public void close() {

        }
      };
    }
  }

  /**
   * Used in aggregations to keep track of the Top five songs
   */
  static class TopFiveSongs implements Iterable<SongPlayCount> {
    private final Map<Long, SongPlayCount> currentSongs = new HashMap<>();
    private final TreeSet<SongPlayCount> topFive = new TreeSet<>((o1, o2) -> {
      final int result = o2.getPlays().compareTo(o1.getPlays());
      if (result != 0) {
        return result;
      }
      return o1.getSongId().compareTo(o2.getSongId());
    });

    public void add(final SongPlayCount songPlayCount) {
      if(currentSongs.containsKey(songPlayCount.getSongId())) {
        topFive.remove(currentSongs.remove(songPlayCount.getSongId()));
      }
      topFive.add(songPlayCount);
      currentSongs.put(songPlayCount.getSongId(), songPlayCount);
      if (topFive.size() > 5) {
        final SongPlayCount last = topFive.last();
        currentSongs.remove(last.getSongId());
        topFive.remove(last);
      }
    }

    void remove(final SongPlayCount value) {
      topFive.remove(value);
      currentSongs.remove(value.getSongId());
    }


    @Override
    public Iterator<SongPlayCount> iterator() {
      return topFive.iterator();
    }
  }


}