/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver.wal;
import java.io.EOFException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.RemoteExceptionHandler;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Entry;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Reader;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Writer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
import org.apache.hadoop.io.MultipleIOException;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
 * This class is responsible for splitting up a bunch of regionserver commit log
 * files that are no longer being written to, into new files, one per region,
 * for the regions to replay on startup. The old log files are deleted when the
 * split is complete.
*/
public class HLogSplitter {
private static final String LOG_SPLITTER_IMPL = "hbase.hlog.splitter.impl";
  /**
   * Name of the directory that holds recovered edits written by the wal log
   * splitting code, one per region
   */
public static final String RECOVERED_EDITS = "recovered.edits";
static final Log LOG = LogFactory.getLog(HLogSplitter.class);
private boolean hasSplit = false;
private long splitTime = 0;
private long splitSize = 0;
// Parameters for split process
protected final Path rootDir;
protected final Path srcDir;
protected final Path oldLogDir;
protected final FileSystem fs;
protected final Configuration conf;
// Major subcomponents of the split process.
// These are separated into inner classes to make testing easier.
OutputSink outputSink;
EntryBuffers entryBuffers;
// If an exception is thrown by one of the other threads, it will be
// stored here.
protected AtomicReference<Throwable> thrown = new AtomicReference<Throwable>();
  // Wait/notify for when data has been produced by the reader thread,
  // consumed by the writer threads, or an exception has occurred
Object dataAvailable = new Object();
private MonitoredTask status;
/**
   * Create a new HLogSplitter using the given {@link Configuration} and the
   * <code>hbase.hlog.splitter.impl</code> property to derive the instance
   * class to use.
   * <p>
   * @param conf configuration to use
* @param rootDir hbase directory
* @param srcDir logs directory
* @param oldLogDir directory where processed logs are archived to
* @param fs FileSystem
* @return New HLogSplitter instance
*/
public static HLogSplitter createLogSplitter(Configuration conf,
final Path rootDir, final Path srcDir,
Path oldLogDir, final FileSystem fs) {
@SuppressWarnings("unchecked")
Class<? extends HLogSplitter> splitterClass = (Class<? extends HLogSplitter>) conf
.getClass(LOG_SPLITTER_IMPL, HLogSplitter.class);
try {
Constructor<? extends HLogSplitter> constructor =
splitterClass.getConstructor(
Configuration.class, // conf
Path.class, // rootDir
Path.class, // srcDir
Path.class, // oldLogDir
FileSystem.class); // fs
return constructor.newInstance(conf, rootDir, srcDir, oldLogDir, fs);
} catch (IllegalArgumentException e) {
throw new RuntimeException(e);
} catch (InstantiationException e) {
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
} catch (InvocationTargetException e) {
throw new RuntimeException(e);
} catch (SecurityException e) {
throw new RuntimeException(e);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
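  // Illustrative usage sketch (hypothetical caller code, not part of this
  // class): a custom splitter can be plugged in through the
  // "hbase.hlog.splitter.impl" property, provided it exposes the same
  // five-argument constructor looked up above:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass("hbase.hlog.splitter.impl", MySplitter.class,
  //       HLogSplitter.class);
  //   HLogSplitter splitter =
  //       HLogSplitter.createLogSplitter(conf, rootDir, srcDir, oldLogDir, fs);
  //   List<Path> splits = splitter.splitLog();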
public HLogSplitter(Configuration conf, Path rootDir, Path srcDir,
Path oldLogDir, FileSystem fs) {
this.conf = conf;
this.rootDir = rootDir;
this.srcDir = srcDir;
this.oldLogDir = oldLogDir;
this.fs = fs;
entryBuffers = new EntryBuffers(
conf.getInt("hbase.regionserver.hlog.splitlog.buffersize",
128*1024*1024));
outputSink = new OutputSink();
}
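  // Tuning note: the 128MB default above bounds the heap used to buffer edits
  // between the reader and the writer threads. For example (illustrative):
  //
  //   conf.setInt("hbase.regionserver.hlog.splitlog.buffersize",
  //       64 * 1024 * 1024);
  //
  // halves the threshold at which EntryBuffers.appendEntry() blocks the reader.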
/**
   * Split up a bunch of regionserver commit log files that are no longer being
   * written to, into new files, one per region, for the regions to replay on
   * startup. Deletes the old log files when finished.
   *
   * @throws IOException if a log file is corrupted and corrupted hlogs are
   *         not tolerated (hbase.hlog.split.skip.errors is false)
* @return the list of splits
*/
public List<Path> splitLog()
throws IOException {
Preconditions.checkState(!hasSplit,
"An HLogSplitter instance may only be used once");
hasSplit = true;
status = TaskMonitor.get().createStatus(
"Splitting logs in " + srcDir);
long startTime = EnvironmentEdgeManager.currentTimeMillis();
status.setStatus("Determining files to split...");
List<Path> splits = null;
if (!fs.exists(srcDir)) {
// Nothing to do
status.markComplete("No log directory existed to split.");
return splits;
}
FileStatus[] logfiles = fs.listStatus(srcDir);
    if (logfiles == null || logfiles.length == 0) {
      // Nothing to do
      status.markComplete("No log files existed to split in " + srcDir);
      return splits;
    }
logAndReport("Splitting " + logfiles.length + " hlog(s) in "
+ srcDir.toString());
splits = splitLog(logfiles);
splitTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
String msg = "hlog file splitting completed in " + splitTime +
" ms for " + srcDir.toString();
status.markComplete(msg);
LOG.info(msg);
return splits;
}
private void logAndReport(String msg) {
status.setStatus(msg);
LOG.info(msg);
}
/**
* @return time that this split took
*/
public long getTime() {
return this.splitTime;
}
/**
* @return aggregate size of hlogs that were split
*/
public long getSize() {
return this.splitSize;
}
/**
* @return a map from encoded region ID to the number of edits written out
* for that region.
*/
Map<byte[], Long> getOutputCounts() {
Preconditions.checkState(hasSplit);
return outputSink.getOutputCounts();
}
/**
   * Splits the HLog edits in the given list of logfiles (that are a mix of
   * edits on multiple regions) by region and then writes them out to
   * per-region directories.
* <p>
* This process is split into multiple threads. In the main thread, we loop
* through the logs to be split. For each log, we:
* <ul>
* <li> Recover it (take and drop HDFS lease) to ensure no other process can write</li>
   * <li> Read each edit (see {@link #parseHLog})</li>
* <li> Mark as "processed" or "corrupt" depending on outcome</li>
* </ul>
* <p>
* Each edit is passed into the EntryBuffers instance, which takes care of
* memory accounting and splitting the edits by region.
* <p>
* The OutputSink object then manages N other WriterThreads which pull chunks
* of edits from EntryBuffers and write them to the output region directories.
* <p>
* After the process is complete, the log files are archived to a separate
* directory.
*/
private List<Path> splitLog(final FileStatus[] logfiles) throws IOException {
List<Path> processedLogs = new ArrayList<Path>();
List<Path> corruptedLogs = new ArrayList<Path>();
List<Path> splits = null;
boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", true);
    splitSize = 0;
outputSink.startWriterThreads(entryBuffers);
try {
int i = 0;
for (FileStatus log : logfiles) {
Path logPath = log.getPath();
long logLength = log.getLen();
splitSize += logLength;
logAndReport("Splitting hlog " + (i++ + 1) + " of " + logfiles.length
+ ": " + logPath + ", length=" + logLength);
Reader in;
try {
in = getReader(fs, log, conf, skipErrors);
if (in != null) {
parseHLog(in, logPath, entryBuffers, fs, conf, skipErrors);
try {
in.close();
} catch (IOException e) {
LOG.warn("Close log reader threw exception -- continuing",
e);
}
}
processedLogs.add(logPath);
} catch (CorruptedLogFileException e) {
LOG.info("Got while parsing hlog " + logPath +
". Marking as corrupted", e);
corruptedLogs.add(logPath);
continue;
}
}
status.setStatus("Log splits complete. Checking for orphaned logs.");
if (fs.listStatus(srcDir).length > processedLogs.size()
+ corruptedLogs.size()) {
throw new OrphanHLogAfterSplitException(
"Discovered orphan hlog after split. Maybe the "
+ "HRegionServer was not dead when we started");
}
} finally {
status.setStatus("Finishing writing output logs and closing down.");
splits = outputSink.finishWritingAndClose();
}
status.setStatus("Archiving logs after completed split");
archiveLogs(srcDir, corruptedLogs, processedLogs, oldLogDir, fs, conf);
return splits;
}
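  // Data-flow sketch of the method above (one reader, N writer threads):
  //
  //   main thread:       getReader() -> parseHLog() -> entryBuffers.appendEntry()
  //   WriterThread (xN): entryBuffers.getChunkToWrite()
  //                        -> outputSink.getWriterAndPath(entry).w.append(entry)
  //                        -> entryBuffers.doneWriting(buffer)
  //
  // appendEntry() blocks once the buffered heap crosses the configured cap,
  // applying back-pressure to the reader.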
/**
* @return the total size of the passed list of files.
*/
private static long countTotalBytes(FileStatus[] logfiles) {
long ret = 0;
for (FileStatus stat : logfiles) {
ret += stat.getLen();
}
return ret;
}
/**
   * Splits an HLog file into a temporary staging area. tmpname is used to build
* the name of the staging area where the recovered-edits will be separated
* out by region and stored.
* <p>
* If the log file has N regions then N recovered.edits files will be
* produced. There is no buffering in this code. Instead it relies on the
* buffering in the SequenceFileWriter.
* <p>
   * @param rootDir the HBase root directory
   * @param tmpname name of the staging area where the output is written
   * @param logfile the log file to split
   * @param fs the filesystem
   * @param conf the configuration
   * @param reporter used to report progress and check for cancellation
   * @return false if it is interrupted by the progressable.
* @throws IOException
*/
static public boolean splitLogFileToTemp(Path rootDir, String tmpname,
FileStatus logfile, FileSystem fs,
Configuration conf, CancelableProgressable reporter) throws IOException {
    HLogSplitter s = new HLogSplitter(conf, rootDir, null /* srcDir */,
        null /* oldLogDir */, fs);
return s.splitLogFileToTemp(logfile, tmpname, reporter);
}
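  // Illustrative worker-side call (hypothetical variable names): a
  // SplitLogWorker-style caller resolves the log's FileStatus and passes a
  // CancelableProgressable so the split can be preempted:
  //
  //   FileStatus logfile = fs.getFileStatus(new Path(rootdir, logfileName));
  //   boolean done = HLogSplitter.splitLogFileToTemp(
  //       rootdir, tmpname, logfile, fs, conf, reporter);
  //   // done == false means the reporter asked us to terminate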
public boolean splitLogFileToTemp(FileStatus logfile, String tmpname,
CancelableProgressable reporter) throws IOException {
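    // Map of encoded region name -> writer; values are either a WriterAndPath
    // or the BAD_WRITER sentinel below, marking regions whose edits we skip.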
final Map<byte[], Object> logWriters = Collections.
synchronizedMap(new TreeMap<byte[], Object>(Bytes.BYTES_COMPARATOR));
boolean isCorrupted = false;
Preconditions.checkState(status == null);
    status = TaskMonitor.get().createStatus(
        "Splitting log file " + logfile.getPath() +
        " into a temporary staging area.");
Object BAD_WRITER = new Object();
boolean progress_failed = false;
boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors",
HLog.SPLIT_SKIP_ERRORS_DEFAULT);
int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024);
    // How often to send a progress report (default 1/2 the split log manager
    // timeout or, if that is not set, 1/2 the split log DEFAULT_TIMEOUT)
int period = conf.getInt("hbase.splitlog.report.period",
conf.getInt("hbase.splitlog.manager.timeout", ZKSplitLog.DEFAULT_TIMEOUT) / 2);
int numOpenedFilesBeforeReporting =
conf.getInt("hbase.splitlog.report.openedfiles", 3);
Path logPath = logfile.getPath();
long logLength = logfile.getLen();
LOG.info("Splitting hlog: " + logPath + ", length=" + logLength);
status.setStatus("Opening log file");
Reader in = null;
try {
in = getReader(fs, logfile, conf, skipErrors);
} catch (CorruptedLogFileException e) {
LOG.warn("Could not get reader, corrupted log file " + logPath, e);
ZKSplitLog.markCorrupted(rootDir, tmpname, fs);
isCorrupted = true;
}
if (in == null) {
status.markComplete("Was nothing to split in log file");
LOG.warn("Nothing to split in log file " + logPath);
return true;
}
long t = EnvironmentEdgeManager.currentTimeMillis();
long last_report_at = t;
if (reporter != null && reporter.progress() == false) {
status.markComplete("Failed: reporter.progress asked us to terminate");
return false;
}
// Report progress every so many edits and/or files opened (opening a file
// takes a bit of time).
int editsCount = 0;
int numNewlyOpenedFiles = 0;
Entry entry;
try {
      while ((entry = getNextLogLine(in, logPath, skipErrors)) != null) {
byte[] region = entry.getKey().getEncodedRegionName();
Object o = logWriters.get(region);
if (o == BAD_WRITER) {
continue;
}
WriterAndPath wap = (WriterAndPath)o;
if (wap == null) {
wap = createWAP(region, entry, rootDir, tmpname, fs, conf);
numNewlyOpenedFiles++;
if (wap == null) {
// ignore edits from this region. It doesn't exist anymore.
// It was probably already split.
logWriters.put(region, BAD_WRITER);
continue;
} else {
logWriters.put(region, wap);
}
}
wap.w.append(entry);
editsCount++;
// If sufficient edits have passed OR we've opened a few files, check if
// we should report progress.
if (editsCount % interval == 0 ||
(numNewlyOpenedFiles > numOpenedFilesBeforeReporting)) {
// Zero out files counter each time we fall in here.
numNewlyOpenedFiles = 0;
String countsStr = "edits=" + editsCount + ", files=" + logWriters.size();
status.setStatus("Split " + countsStr);
long t1 = EnvironmentEdgeManager.currentTimeMillis();
if ((t1 - last_report_at) > period) {
            last_report_at = t1;
if (reporter != null && reporter.progress() == false) {
status.markComplete("Failed: reporter.progress asked us to terminate; " + countsStr);
progress_failed = true;
return false;
}
}
}
}
} catch (CorruptedLogFileException e) {
LOG.warn("Could not parse, corrupted log file " + logPath, e);
ZKSplitLog.markCorrupted(rootDir, tmpname, fs);
isCorrupted = true;
} catch (IOException e) {
e = RemoteExceptionHandler.checkIOException(e);
throw e;
} finally {
int n = 0;
for (Object o : logWriters.values()) {
long t1 = EnvironmentEdgeManager.currentTimeMillis();
if ((t1 - last_report_at) > period) {
          last_report_at = t1;
if ((progress_failed == false) && (reporter != null) &&
(reporter.progress() == false)) {
progress_failed = true;
}
}
if (o == BAD_WRITER) {
continue;
}
n++;
WriterAndPath wap = (WriterAndPath)o;
wap.w.close();
LOG.debug("Closed " + wap.p);
Path dst = getCompletedRecoveredEditsFilePath(wap.p);
if (!dst.equals(wap.p) && fs.exists(dst)) {
LOG.warn("Found existing old edits file. It could be the "
+ "result of a previous failed split attempt. Deleting " + dst
+ ", length=" + fs.getFileStatus(dst).getLen());
if (!fs.delete(dst, false)) {
LOG.warn("Failed deleting of old " + dst);
throw new IOException("Failed deleting of old " + dst);
}
}
// Skip the unit tests which create a splitter that reads and writes the
// data without touching disk. TestHLogSplit#testThreading is an
// example.
if (fs.exists(wap.p)) {
if (!fs.rename(wap.p, dst)) {
throw new IOException("Failed renaming " + wap.p + " to " + dst);
}
}
}
String msg = "Processed " + editsCount + " edits across " + n + " regions" +
" threw away edits for " + (logWriters.size() - n) + " regions" +
"; log file=" + logPath + " is corrupted=" + isCorrupted +
" progress failed=" + progress_failed;
LOG.info(msg);
status.markComplete(msg);
}
return !progress_failed;
}
/**
   * Completes the work done by splitLogFileToTemp by moving the
   * recovered.edits from the staging area to the respective region
   * directories.
   * <p>
   * It is invoked by SplitLogManager once it knows that one of the
   * SplitLogWorkers has completed the splitLogFileToTemp() part. If the
   * master crashes then this function might get called multiple times.
   * <p>
   * @param tmpname name of the staging area from which to move the edits
   * @param logfile the log file that was split
   * @param conf the configuration
   * @throws IOException
*/
public static void moveRecoveredEditsFromTemp(String tmpname,
String logfile, Configuration conf)
throws IOException{
Path rootdir = FSUtils.getRootDir(conf);
Path oldLogDir = new Path(rootdir, HConstants.HREGION_OLDLOGDIR_NAME);
moveRecoveredEditsFromTemp(tmpname, rootdir, oldLogDir, logfile, conf);
}
public static void moveRecoveredEditsFromTemp(String tmpname,
Path rootdir, Path oldLogDir,
String logfile, Configuration conf)
throws IOException{
List<Path> processedLogs = new ArrayList<Path>();
List<Path> corruptedLogs = new ArrayList<Path>();
    FileSystem fs = rootdir.getFileSystem(conf);
Path logPath = new Path(logfile);
if (ZKSplitLog.isCorrupted(rootdir, tmpname, fs)) {
corruptedLogs.add(logPath);
} else {
processedLogs.add(logPath);
}
Path stagingDir = ZKSplitLog.getSplitLogDir(rootdir, tmpname);
List<FileStatus> files = listAll(fs, stagingDir);
for (FileStatus f : files) {
Path src = f.getPath();
Path dst = ZKSplitLog.stripSplitLogTempDir(rootdir, src);
if (ZKSplitLog.isCorruptFlagFile(dst)) {
continue;
}
if (fs.exists(src)) {
if (fs.exists(dst)) {
fs.delete(dst, false);
} else {
Path dstdir = dst.getParent();
if (!fs.exists(dstdir)) {
if (!fs.mkdirs(dstdir)) LOG.warn("mkdir failed on " + dstdir);
}
}
fs.rename(src, dst);
LOG.debug(" moved " + src + " => " + dst);
} else {
LOG.debug("Could not move recovered edits from " + src +
" as it doesn't exist");
}
}
archiveLogs(null, corruptedLogs, processedLogs,
oldLogDir, fs, conf);
fs.delete(stagingDir, true);
return;
}
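  // Two-phase protocol sketch (illustrative, hypothetical variable names): a
  // worker first writes region edits under the staging area via
  // splitLogFileToTemp(); the SplitLogManager later promotes the output and
  // archives the source log:
  //
  //   if (HLogSplitter.splitLogFileToTemp(rootdir, tmpname, logfile, fs,
  //       conf, reporter)) {
  //     HLogSplitter.moveRecoveredEditsFromTemp(tmpname, logfileName, conf);
  //   }
  //
  // Because existing destinations are deleted before the rename, the move is
  // idempotent and safe to retry after a master crash.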
private static List<FileStatus> listAll(FileSystem fs, Path dir)
throws IOException {
List<FileStatus> fset = new ArrayList<FileStatus>(100);
FileStatus [] files = fs.exists(dir)? fs.listStatus(dir): null;
if (files != null) {
for (FileStatus f : files) {
if (f.isDir()) {
fset.addAll(listAll(fs, f.getPath()));
} else {
fset.add(f);
}
}
}
return fset;
}
/**
   * Moves processed logs to the oldLogDir after successful processing. Moves
   * corrupted logs (any log that couldn't be successfully parsed) to the
   * corruptDir (.corrupt) for later investigation.
   *
   * @param srcDir the region server's log directory; deleted when non-null
   * @param corruptedLogs
* @param processedLogs
* @param oldLogDir
* @param fs
* @param conf
* @throws IOException
*/
private static void archiveLogs(
final Path srcDir,
final List<Path> corruptedLogs,
final List<Path> processedLogs, final Path oldLogDir,
final FileSystem fs, final Configuration conf) throws IOException {
final Path corruptDir = new Path(conf.get(HConstants.HBASE_DIR), conf.get(
"hbase.regionserver.hlog.splitlog.corrupt.dir", ".corrupt"));
if (!fs.mkdirs(corruptDir)) {
LOG.info("Unable to mkdir " + corruptDir);
}
fs.mkdirs(oldLogDir);
// this method can get restarted or called multiple times for archiving
// the same log files.
for (Path corrupted : corruptedLogs) {
Path p = new Path(corruptDir, corrupted.getName());
if (fs.exists(corrupted)) {
if (!fs.rename(corrupted, p)) {
LOG.warn("Unable to move corrupted log " + corrupted + " to " + p);
} else {
LOG.warn("Moving corrupted log " + corrupted + " to " + p);
}
}
}
for (Path p : processedLogs) {
Path newPath = HLog.getHLogArchivePath(oldLogDir, p);
if (fs.exists(p)) {
if (!fs.rename(p, newPath)) {
LOG.warn("Unable to move " + p + " to " + newPath);
} else {
LOG.debug("Archived processed log " + p + " to " + newPath);
}
}
}
// distributed log splitting removes the srcDir (region's log dir) later
// when all the log files in that srcDir have been successfully processed
if (srcDir != null && !fs.delete(srcDir, true)) {
throw new IOException("Unable to delete src dir: " + srcDir);
}
}
/**
* Path to a file under RECOVERED_EDITS_DIR directory of the region found in
* <code>logEntry</code> named for the sequenceid in the passed
* <code>logEntry</code>: e.g. /hbase/some_table/2323432434/recovered.edits/2332.
 * This method also ensures the existence of RECOVERED_EDITS_DIR under the
 * region directory, creating it if necessary.
 * @param fs
 * @param logEntry
 * @param rootDir HBase root dir.
 * @param isCreate if true, create RECOVERED_EDITS_DIR when it is missing
* @return Path to file into which to dump split log edits.
* @throws IOException
*/
static Path getRegionSplitEditsPath(final FileSystem fs,
final Entry logEntry, final Path rootDir, boolean isCreate)
throws IOException {
Path tableDir = HTableDescriptor.getTableDir(rootDir, logEntry.getKey()
.getTablename());
Path regiondir = HRegion.getRegionDir(tableDir,
Bytes.toString(logEntry.getKey().getEncodedRegionName()));
Path dir = HLog.getRegionDirRecoveredEditsDir(regiondir);
if (!fs.exists(regiondir)) {
LOG.info("This region's directory doesn't exist: "
+ regiondir.toString() + ". It is very likely that it was" +
" already split so it's safe to discard those edits.");
return null;
}
if (isCreate && !fs.exists(dir)) {
if (!fs.mkdirs(dir)) LOG.warn("mkdir failed on " + dir);
}
    // Append RECOVERED_LOG_TMPFILE_SUFFIX to the file name so that the
    // region's replayRecoveredEdits will not delete it while it is being written
String fileName = formatRecoveredEditsFileName(logEntry.getKey()
.getLogSeqNum());
fileName = getTmpRecoveredEditsFileName(fileName);
return new Path(dir, fileName);
}
static String getTmpRecoveredEditsFileName(String fileName) {
return fileName + HLog.RECOVERED_LOG_TMPFILE_SUFFIX;
}
/**
* Convert path to a file under RECOVERED_EDITS_DIR directory without
* RECOVERED_LOG_TMPFILE_SUFFIX
* @param srcPath
* @return dstPath without RECOVERED_LOG_TMPFILE_SUFFIX
*/
static Path getCompletedRecoveredEditsFilePath(Path srcPath) {
String fileName = srcPath.getName();
    if (fileName.endsWith(HLog.RECOVERED_LOG_TMPFILE_SUFFIX)) {
      // Strip the suffix directly; String.split would treat it as a regex.
      fileName = fileName.substring(0,
          fileName.length() - HLog.RECOVERED_LOG_TMPFILE_SUFFIX.length());
    }
return new Path(srcPath.getParent(), fileName);
}
static String formatRecoveredEditsFileName(final long seqid) {
return String.format("%019d", seqid);
}
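  // Naming example: edits for sequence id 2332 are first written to the
  // zero-padded temporary file
  //   .../recovered.edits/0000000000000002332.temp
  // (formatRecoveredEditsFileName(2332) plus RECOVERED_LOG_TMPFILE_SUFFIX,
  // assuming the suffix is ".temp") and are renamed on completion to
  //   .../recovered.edits/0000000000000002332
  // so the lexicographic order of completed files matches seqid order.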
  /*
   * Parse a single hlog and push each edit into the passed EntryBuffers.
   *
   * @param in reader over the hlog to split
   * @param path path of the hlog, used for logging and error reporting
   * @param entryBuffers sink that buffers the edits, keyed by region
   * @param fs the filesystem
   * @param conf the configuration
   * @throws IOException
   * @throws CorruptedLogFileException if hlog is corrupted
   */
private void parseHLog(final Reader in, Path path,
EntryBuffers entryBuffers, final FileSystem fs,
final Configuration conf, boolean skipErrors)
throws IOException, CorruptedLogFileException {
int editsCount = 0;
try {
Entry entry;
while ((entry = getNextLogLine(in, path, skipErrors)) != null) {
entryBuffers.appendEntry(entry);
editsCount++;
}
} catch (InterruptedException ie) {
IOException t = new InterruptedIOException();
t.initCause(ie);
throw t;
} finally {
LOG.debug("Pushed=" + editsCount + " entries from " + path);
}
}
/**
* Create a new {@link Reader} for reading logs to split.
*
* @param fs
* @param file
* @param conf
* @return A new Reader instance
* @throws IOException
   * @throws CorruptedLogFileException
*/
protected Reader getReader(FileSystem fs, FileStatus file, Configuration conf,
boolean skipErrors)
throws IOException, CorruptedLogFileException {
Path path = file.getPath();
long length = file.getLen();
Reader in;
// Check for possibly empty file. With appends, currently Hadoop reports a
// zero length even if the file has been sync'd. Revisit if HDFS-376 or
// HDFS-878 is committed.
if (length <= 0) {
LOG.warn("File " + path + " might be still open, length is 0");
}
try {
FSUtils.getInstance(fs, conf).recoverFileLease(fs, path, conf);
try {
in = getReader(fs, path, conf);
} catch (EOFException e) {
if (length <= 0) {
// TODO should we ignore an empty, not-last log file if skip.errors
// is false? Either way, the caller should decide what to do. E.g.
// ignore if this is the last log in sequence.
// TODO is this scenario still possible if the log has been
// recovered (i.e. closed)
LOG.warn("Could not open " + path + " for reading. File is empty", e);
return null;
} else {
// EOFException being ignored
return null;
}
}
} catch (IOException e) {
if (!skipErrors) {
throw e;
}
      CorruptedLogFileException t =
        new CorruptedLogFileException("skipErrors=true; could not open hlog " +
            path + ", ignoring");
t.initCause(e);
throw t;
}
return in;
}
static private Entry getNextLogLine(Reader in, Path path, boolean skipErrors)
throws CorruptedLogFileException, IOException {
try {
return in.next();
} catch (EOFException eof) {
// truncated files are expected if a RS crashes (see HBASE-2643)
LOG.info("EOF from hlog " + path + ". continuing");
return null;
} catch (IOException e) {
// If the IOE resulted from bad file format,
// then this problem is idempotent and retrying won't help
if (e.getCause() != null &&
(e.getCause() instanceof ParseException ||
e.getCause() instanceof org.apache.hadoop.fs.ChecksumException)) {
LOG.warn("Parse exception " + e.getCause().toString() + " from hlog "
+ path + ". continuing");
return null;
}
if (!skipErrors) {
throw e;
}
      CorruptedLogFileException t =
        new CorruptedLogFileException("skipErrors=true; ignoring exception" +
            " while parsing hlog " + path + ". Marking as corrupted");
t.initCause(e);
throw t;
}
}
private void writerThreadError(Throwable t) {
thrown.compareAndSet(null, t);
}
/**
* Check for errors in the writer threads. If any is found, rethrow it.
*/
private void checkForErrors() throws IOException {
Throwable thrown = this.thrown.get();
if (thrown == null) return;
if (thrown instanceof IOException) {
throw (IOException)thrown;
} else {
throw new RuntimeException(thrown);
}
}
/**
* Create a new {@link Writer} for writing log splits.
*/
protected Writer createWriter(FileSystem fs, Path logfile, Configuration conf)
throws IOException {
return HLog.createWriter(fs, logfile, conf);
}
/**
* Create a new {@link Reader} for reading logs to split.
*/
protected Reader getReader(FileSystem fs, Path curLogFile, Configuration conf)
throws IOException {
return HLog.getReader(fs, curLogFile, conf);
}
/**
   * Class which accumulates edits and separates them into a buffer per region
   * while simultaneously accounting for RAM usage. Blocks if the RAM usage
   * crosses a predefined threshold.
*
* Writer threads then pull region-specific buffers from this class.
*/
class EntryBuffers {
Map<byte[], RegionEntryBuffer> buffers =
new TreeMap<byte[], RegionEntryBuffer>(Bytes.BYTES_COMPARATOR);
/* Track which regions are currently in the middle of writing. We don't allow
an IO thread to pick up bytes from a region if we're already writing
data for that region in a different IO thread. */
Set<byte[]> currentlyWriting = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
long totalBuffered = 0;
long maxHeapUsage;
EntryBuffers(long maxHeapUsage) {
this.maxHeapUsage = maxHeapUsage;
}
/**
* Append a log entry into the corresponding region buffer.
* Blocks if the total heap usage has crossed the specified threshold.
*
* @throws InterruptedException
* @throws IOException
*/
void appendEntry(Entry entry) throws InterruptedException, IOException {
HLogKey key = entry.getKey();
RegionEntryBuffer buffer;
long incrHeap;
synchronized (this) {
buffer = buffers.get(key.getEncodedRegionName());
if (buffer == null) {
buffer = new RegionEntryBuffer(key.getTablename(), key.getEncodedRegionName());
buffers.put(key.getEncodedRegionName(), buffer);
}
        incrHeap = buffer.appendEntry(entry);
}
// If we crossed the chunk threshold, wait for more space to be available
synchronized (dataAvailable) {
totalBuffered += incrHeap;
while (totalBuffered > maxHeapUsage && thrown.get() == null) {
LOG.debug("Used " + totalBuffered + " bytes of buffered edits, waiting for IO threads...");
dataAvailable.wait(3000);
}
dataAvailable.notifyAll();
}
checkForErrors();
}
synchronized RegionEntryBuffer getChunkToWrite() {
      long biggestSize = 0;
      byte[] biggestBufferKey = null;
for (Map.Entry<byte[], RegionEntryBuffer> entry : buffers.entrySet()) {
long size = entry.getValue().heapSize();
if (size > biggestSize && !currentlyWriting.contains(entry.getKey())) {
biggestSize = size;
biggestBufferKey = entry.getKey();
}
}
if (biggestBufferKey == null) {
return null;
}
RegionEntryBuffer buffer = buffers.remove(biggestBufferKey);
currentlyWriting.add(biggestBufferKey);
return buffer;
}
void doneWriting(RegionEntryBuffer buffer) {
synchronized (this) {
boolean removed = currentlyWriting.remove(buffer.encodedRegionName);
assert removed;
}
long size = buffer.heapSize();
synchronized (dataAvailable) {
totalBuffered -= size;
// We may unblock writers
dataAvailable.notifyAll();
}
}
synchronized boolean isRegionCurrentlyWriting(byte[] region) {
return currentlyWriting.contains(region);
}
}
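  // Scheduling note: getChunkToWrite() always hands out the largest buffer
  // that is not already being written. For example (illustrative sizes), with
  // buffers {A=10MB, B=6MB} and A in flight on one writer thread, a second
  // writer thread receives B instead of waiting on A.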
/**
* A buffer of some number of edits for a given region.
* This accumulates edits and also provides a memory optimization in order to
* share a single byte array instance for the table and region name.
* Also tracks memory usage of the accumulated edits.
*/
static class RegionEntryBuffer implements HeapSize {
long heapInBuffer = 0;
List<Entry> entryBuffer;
byte[] tableName;
byte[] encodedRegionName;
RegionEntryBuffer(byte[] table, byte[] region) {
this.tableName = table;
this.encodedRegionName = region;
this.entryBuffer = new LinkedList<Entry>();
}
long appendEntry(Entry entry) {
internify(entry);
entryBuffer.add(entry);
long incrHeap = entry.getEdit().heapSize() +
ClassSize.align(2 * ClassSize.REFERENCE) + // HLogKey pointers
0; // TODO linkedlist entry
heapInBuffer += incrHeap;
return incrHeap;
}
private void internify(Entry entry) {
HLogKey k = entry.getKey();
k.internTableName(this.tableName);
k.internEncodedRegionName(this.encodedRegionName);
}
public long heapSize() {
return heapInBuffer;
}
}
class WriterThread extends Thread {
private volatile boolean shouldStop = false;
WriterThread(int i) {
super("WriterThread-" + i);
}
    @Override
    public void run() {
try {
doRun();
} catch (Throwable t) {
LOG.error("Error in log splitting write thread", t);
writerThreadError(t);
}
}
private void doRun() throws IOException {
LOG.debug("Writer thread " + this + ": starting");
while (true) {
RegionEntryBuffer buffer = entryBuffers.getChunkToWrite();
if (buffer == null) {
// No data currently available, wait on some more to show up
synchronized (dataAvailable) {
if (shouldStop) return;
try {
dataAvailable.wait(1000);
} catch (InterruptedException ie) {
if (!shouldStop) {
throw new RuntimeException(ie);
}
}
}
continue;
}
assert buffer != null;
try {
writeBuffer(buffer);
} finally {
entryBuffers.doneWriting(buffer);
}
}
}
private void writeBuffer(RegionEntryBuffer buffer) throws IOException {
List<Entry> entries = buffer.entryBuffer;
if (entries.isEmpty()) {
LOG.warn(this.getName() + " got an empty buffer, skipping");
return;
}
WriterAndPath wap = null;
long startTime = System.nanoTime();
try {
int editsCount = 0;
for (Entry logEntry : entries) {
if (wap == null) {
wap = outputSink.getWriterAndPath(logEntry);
if (wap == null) {
// getWriterAndPath decided we don't need to write these edits
// Message was already logged
return;
}
}
wap.w.append(logEntry);
editsCount++;
}
// Pass along summary statistics
wap.incrementEdits(editsCount);
wap.incrementNanoTime(System.nanoTime() - startTime);
} catch (IOException e) {
e = RemoteExceptionHandler.checkIOException(e);
        LOG.fatal(this.getName() + " got an exception while writing a log entry", e);
throw e;
}
}
void finish() {
synchronized (dataAvailable) {
shouldStop = true;
dataAvailable.notifyAll();
}
}
}
private WriterAndPath createWAP(byte[] region, Entry entry,
Path rootdir, String tmpname, FileSystem fs, Configuration conf)
throws IOException {
Path regionedits = getRegionSplitEditsPath(fs, entry, rootdir,
tmpname==null);
if (regionedits == null) {
return null;
}
if ((tmpname == null) && fs.exists(regionedits)) {
LOG.warn("Found existing old edits file. It could be the "
+ "result of a previous failed split attempt. Deleting "
+ regionedits + ", length="
+ fs.getFileStatus(regionedits).getLen());
if (!fs.delete(regionedits, false)) {
LOG.warn("Failed delete of old " + regionedits);
}
}
Path editsfile;
if (tmpname != null) {
// During distributed log splitting the output by each
// SplitLogWorker is written to a temporary area.
editsfile = convertRegionEditsToTemp(rootdir, regionedits, tmpname);
} else {
editsfile = regionedits;
}
Writer w = createWriter(fs, editsfile, conf);
LOG.debug("Creating writer path=" + editsfile + " region="
+ Bytes.toStringBinary(region));
return (new WriterAndPath(editsfile, w));
}
Path convertRegionEditsToTemp(Path rootdir, Path edits, String tmpname) {
List<String> components = new ArrayList<String>(10);
do {
components.add(edits.getName());
edits = edits.getParent();
} while (edits.depth() > rootdir.depth());
Path ret = ZKSplitLog.getSplitLogDir(rootdir, tmpname);
for (int i = components.size() - 1; i >= 0; i--) {
ret = new Path(ret, components.get(i));
}
try {
if (fs.exists(ret)) {
LOG.warn("Found existing old temporary edits file. It could be the "
+ "result of a previous failed split attempt. Deleting "
+ ret + ", length="
+ fs.getFileStatus(ret).getLen());
if (!fs.delete(ret, false)) {
LOG.warn("Failed delete of old " + ret);
}
}
Path dir = ret.getParent();
if (!fs.exists(dir)) {
if (!fs.mkdirs(dir)) LOG.warn("mkdir failed on " + dir);
}
} catch (IOException e) {
LOG.warn("Could not prepare temp staging area ", e);
// ignore, exceptions will be thrown elsewhere
}
return ret;
}
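  // Path-mapping example (illustrative names, assuming the staging area lives
  // under <rootdir>/splitlog/<tmpname>): with
  //   rootdir = /hbase, tmpname = "worker1",
  //   edits   = /hbase/t1/2323432434/recovered.edits/0000000000000002332.temp
  // the path components below rootdir are rebuilt under the staging dir:
  //   /hbase/splitlog/worker1/t1/2323432434/recovered.edits/0000000000000002332.temp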
/**
* Class that manages the output streams from the log splitting process.
*/
class OutputSink {
private final Map<byte[], WriterAndPath> logWriters = Collections.synchronizedMap(
new TreeMap<byte[], WriterAndPath>(Bytes.BYTES_COMPARATOR));
private final List<WriterThread> writerThreads = Lists.newArrayList();
/* Set of regions which we've decided should not output edits */
private final Set<byte[]> blacklistedRegions = Collections.synchronizedSet(
new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR));
private boolean closeAndCleanCompleted = false;
private boolean logWritersClosed = false;
/**
* Start the threads that will pump data from the entryBuffers
* to the output files.
*/
synchronized void startWriterThreads(EntryBuffers entryBuffers) {
// More threads could potentially write faster at the expense
// of causing more disk seeks as the logs are split.
      // Beyond a certain setting (probably around 3) the process
      // will be bound on the reader in the current implementation
      // anyway.
int numThreads = conf.getInt(
"hbase.regionserver.hlog.splitlog.writer.threads", 3);
for (int i = 0; i < numThreads; i++) {
WriterThread t = new WriterThread(i);
t.start();
writerThreads.add(t);
}
}
List<Path> finishWritingAndClose() throws IOException {
LOG.info("Waiting for split writer threads to finish");
try {
for (WriterThread t : writerThreads) {
t.finish();
}
for (WriterThread t : writerThreads) {
try {
t.join();
} catch (InterruptedException ie) {
throw new IOException(ie);
}
checkForErrors();
}
LOG.info("Split writers finished");
return closeStreams();
} finally {
List<IOException> thrown = closeLogWriters(null);
if (thrown != null && !thrown.isEmpty()) {
throw MultipleIOException.createIOException(thrown);
}
}
}
/**
* Close all of the output streams.
* @return the list of paths written.
*/
private List<Path> closeStreams() throws IOException {
Preconditions.checkState(!closeAndCleanCompleted);
List<Path> paths = new ArrayList<Path>();
List<IOException> thrown = Lists.newArrayList();
closeLogWriters(thrown);
for (WriterAndPath wap : logWriters.values()) {
Path dst = getCompletedRecoveredEditsFilePath(wap.p);
try {
if (!dst.equals(wap.p) && fs.exists(dst)) {
LOG.warn("Found existing old edits file. It could be the "
+ "result of a previous failed split attempt. Deleting " + dst
+ ", length=" + fs.getFileStatus(dst).getLen());
if (!fs.delete(dst, false)) {
LOG.warn("Failed deleting of old " + dst);
throw new IOException("Failed deleting of old " + dst);
}
}
// Skip the unit tests which create a splitter that reads and writes
// the data without touching disk. TestHLogSplit#testThreading is an
// example.
if (fs.exists(wap.p)) {
if (!fs.rename(wap.p, dst)) {
throw new IOException("Failed renaming " + wap.p + " to " + dst);
}
}
} catch (IOException ioe) {
LOG.error("Couldn't rename " + wap.p + " to " + dst, ioe);
thrown.add(ioe);
continue;
}
paths.add(dst);
}
if (!thrown.isEmpty()) {
throw MultipleIOException.createIOException(thrown);
}
closeAndCleanCompleted = true;
return paths;
}
private List<IOException> closeLogWriters(List<IOException> thrown)
throws IOException {
if (!logWritersClosed) {
if (thrown == null) {
thrown = Lists.newArrayList();
}
for (WriterAndPath wap : logWriters.values()) {
try {
wap.w.close();
} catch (IOException ioe) {
LOG.error("Couldn't close log at " + wap.p, ioe);
thrown.add(ioe);
continue;
}
LOG.info("Closed path " + wap.p + " (wrote " + wap.editsWritten
+ " edits in " + (wap.nanosSpent / 1000 / 1000) + "ms)");
}
logWritersClosed = true;
}
return thrown;
}
/**
* Get a writer and path for a log starting at the given entry.
*
* This function is threadsafe so long as multiple threads are always
* acting on different regions.
*
* @return null if this region shouldn't output any logs
*/
WriterAndPath getWriterAndPath(Entry entry) throws IOException {
      byte[] region = entry.getKey().getEncodedRegionName();
WriterAndPath ret = logWriters.get(region);
if (ret != null) {
return ret;
}
// If we already decided that this region doesn't get any output
// we don't need to check again.
if (blacklistedRegions.contains(region)) {
return null;
}
ret = createWAP(region, entry, rootDir, null, fs, conf);
if (ret == null) {
blacklistedRegions.add(region);
return null;
}
logWriters.put(region, ret);
return ret;
}
/**
* @return a map from encoded region ID to the number of edits written out
* for that region.
*/
private Map<byte[], Long> getOutputCounts() {
TreeMap<byte[], Long> ret = new TreeMap<byte[], Long>(
Bytes.BYTES_COMPARATOR);
synchronized (logWriters) {
for (Map.Entry<byte[], WriterAndPath> entry : logWriters.entrySet()) {
ret.put(entry.getKey(), entry.getValue().editsWritten);
}
}
return ret;
}
}
/**
* Private data structure that wraps a Writer and its Path,
* also collecting statistics about the data written to this
* output.
*/
private final static class WriterAndPath {
final Path p;
final Writer w;
/* Count of edits written to this path */
long editsWritten = 0;
/* Number of nanos spent writing to this log */
long nanosSpent = 0;
WriterAndPath(final Path p, final Writer w) {
this.p = p;
this.w = w;
}
void incrementEdits(int edits) {
editsWritten += edits;
}
void incrementNanoTime(long nanos) {
nanosSpent += nanos;
}
}
static class CorruptedLogFileException extends Exception {
private static final long serialVersionUID = 1L;
CorruptedLogFileException(String s) {
super(s);
}
}
}