* Digest
* (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 28.12.2008 on http://yacy.net
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
package net.yacy.cora.order;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Queue;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.Memory;
public class Digest {
/** Pool of MessageDigest instances; encodeMD5Raw(String) polls it before creating a new digest. */
public static Queue<MessageDigest> digestPool = new ConcurrentLinkedQueue<MessageDigest>();

/** Cache size: Memory.available() / 50000, limited to the range 1000 .. 1000000. */
private static final int md5CacheSize = Math.max(1000, Math.min(1000000, (int) (Memory.available() / 50000L)));

/** Cache from a string key to its raw MD5 digest; assigned once in the static initializer below. */
private static ARC<String, byte[]> md5Cache = null;

static {
    final int cpus = Runtime.getRuntime().availableProcessors();
    try {
        // second argument presumably is the number of cache segments - TODO confirm against ConcurrentARC
        md5Cache = new ConcurrentARC<String, byte[]>(md5CacheSize, Math.max(8, 2 * cpus));
    } catch (final OutOfMemoryError e) {
        // not enough memory for the computed size: fall back to the smallest configuration
        md5Cache = new ConcurrentARC<String, byte[]>(1000, Math.max(2, cpus));
    }
}
* clean the md5 cache
public static void cleanup() {
public static String encodeHex(final long in, final int length) {
String s = Long.toHexString(in);
while (s.length() < length) s = "0" + s;
return s;
public static String encodeOctal(final byte[] in) {
if (in == null) return "";
final StringBuilder result = new StringBuilder(in.length * 8 / 3);
for (final byte element : in) {
if ((0Xff & element) < 8) result.append('0');
result.append(Integer.toOctalString(0Xff & element));
return result.toString();
public static String encodeHex(final byte[] in) {
if (in == null) return "";
final StringBuilder result = new StringBuilder(in.length * 2);
for (final byte element : in) {
if ((0Xff & element) < 16) result.append('0');
result.append(Integer.toHexString(0Xff & element));
return result.toString();
public static byte[] decodeHex(final String hex) {
final byte[] result = new byte[hex.length() / 2];
for (int i = 0; i < result.length; i++) {
result[i] = (byte) (16 * Integer.parseInt(Character.toString(hex.charAt(i * 2)), 16) + Integer.parseInt(Character.toString(hex.charAt(i * 2 + 1)), 16));
return result;
public static String encodeMD5Hex(final String key) {
// generate a hex representation from the md5 of a string
return encodeHex(encodeMD5Raw(key));
public static String encodeMD5Hex(final File file) throws IOException {
// generate a hex representation from the md5 of a file
return encodeHex(encodeMD5Raw(file));
public static String encodeMD5Hex(final byte[] b) {
// generate a hex representation from the md5 of a byte-array
return encodeHex(encodeMD5Raw(b));
public static byte[] encodeMD5Raw(final String key) {
byte[] h = md5Cache.get(key);
if (h != null) return h;
MessageDigest digest = digestPool.poll();
if (digest == null) {
// if there are no digest objects left, create some on the fly
// this is not the most effective way but if we wouldn't do that the encoder would block
try {
digest = MessageDigest.getInstance("MD5");
} catch (final NoSuchAlgorithmException e) {
} else {
digest.reset(); // they should all be reseted but anyway; this is safe
byte[] keyBytes;
keyBytes = UTF8.getBytes(key);
final byte[] result = digest.digest();
digest.reset(); // to be prepared for next
//System.out.println("Digest Pool size = " + digestPool.size());
// update the cache
md5Cache.insertIfAbsent(key, result); // prevent expensive MD5 computation and encoding
return result;
public static byte[] encodeMD5Raw(final File file) throws IOException {
FileInputStream in = null;
try {
in = new FileInputStream(file);
} catch (final java.io.FileNotFoundException e) {
System.out.println("file not found:" + file.toString());
return null;
// create a concurrent thread that consumes data as it is read
// and computes the md5 while doing IO
final md5FilechunkConsumer md5consumer = new md5FilechunkConsumer(1024 * 64, 8);
final ExecutorService service = Executors.newSingleThreadExecutor();
final Future<MessageDigest> md5result = service.submit(md5consumer);
filechunk c;
try {
while (true) {
c = md5consumer.nextFree();
if (c == null) throw new IOException("c == null, probably interrupted");
c.n = in.read(c.b);
if (c.n <= 0) break;
} catch (final IOException e) {
System.out.println("file error with " + file.toString() + ": " + e.getMessage());
throw e;
} finally {
try {in.close();} catch (final IOException e) {}
// put in poison into queue to tell the consumer to stop
// return the md5 digest from future task
try {
return md5result.get().digest();
} catch (final InterruptedException e) {
throw new IOException(e);
} catch (final ExecutionException e) {
throw new IOException(e);
private static class filechunk {
public byte[] b;
public int n;
public filechunk(final int len) {
this.b = new byte[len];
this.n = 0;
private static class md5FilechunkConsumer implements Callable<MessageDigest> {
private final BlockingQueue<filechunk> empty;
private final BlockingQueue<filechunk> filed;
protected static filechunk poison = new filechunk(0);
private MessageDigest digest;
public md5FilechunkConsumer(final int bufferSize, final int bufferCount) {
this.empty = new ArrayBlockingQueue<filechunk>(bufferCount);
this.filed = new LinkedBlockingQueue<filechunk>();
// fill the empty queue
for (int i = 0; i < bufferCount; i++) this.empty.add(new filechunk(bufferSize));
// init digest
try {
this.digest = MessageDigest.getInstance("MD5");
} catch (final NoSuchAlgorithmException e) {
System.out.println("Internal Error at md5:" + e.getMessage());
public void consume(final filechunk c) {
try {
} catch (final InterruptedException e) {
public filechunk nextFree() throws IOException {
try {
return this.empty.take();
} catch (final InterruptedException e) {
throw new IOException(e);
public MessageDigest call() {
try {
filechunk c;
while(true) {
c = this.filed.take();
if (c == poison) break;
this.digest.update(c.b, 0, c.n);
} catch (final InterruptedException e) {
return this.digest;
public static String fastFingerprintHex(final File file, final boolean includeDate) {
try {
return encodeHex(fastFingerprintRaw(file, includeDate));
} catch (final IOException e) {
return null;
public static String fastFingerprintB64(final File file, final boolean includeDate) {
try {
final byte[] b = fastFingerprintRaw(file, includeDate);
assert b != null : "file = " + file.toString();
if (b == null || b.length == 0) return null;
assert b.length != 0 : "file = " + file.toString();
return Base64Order.enhancedCoder.encode(b);
} catch (final IOException e) {
return null;
* the fast fingerprint computes a md5-like hash from a given file,
* which is different from a md5 because it does not read the complete file
* but reads only the first and last megabyte of it. In case that the
* file is less or equal of one megabyte, the fast fingerprint is equal
* to the md5. In other cases the fingerprint is computed from a
* array = byte[32k + 8] array, which consists of:
* array[0 .. 16k - 1] = first MB of file
* array[16k .. 32k - 1] = last MB of file
* array[32k .. 32k + 7] = length of file as long
* if the date flag is set, the array is extended to
* array[32k + 8 .. 32k + 15] = lastModified of file as long
* @param file
* @return fingerprint in md5 raw format
* @throws IOException
public static byte[] fastFingerprintRaw(final File file, final boolean includeDate) throws IOException {
final int mb = 16 * 1024;
final long fl = file.length();
if (fl <= 2 * mb) return encodeMD5Raw(file);
MessageDigest digest;
try {
digest = MessageDigest.getInstance("MD5");
} catch (final NoSuchAlgorithmException e) {
return null;
final RandomAccessFile raf = new RandomAccessFile(file, "r");
final byte[] a = new byte[mb];
try {
raf.readFully(a, 0, mb);
digest.update(a, 0, mb);
raf.seek(fl - mb);
raf.readFully(a, 0, mb);
digest.update(a, 0, mb);
digest.update(NaturalOrder.encodeLong(fl, 8), 0, 8);
if (includeDate) digest.update(NaturalOrder.encodeLong(file.lastModified(), 8), 0, 8);
} finally {
try {raf.getChannel().close();} catch (final IOException e) {}
return digest.digest();
private static byte[] encodeMD5Raw(final byte[] b) {
try {
final MessageDigest digest = MessageDigest.getInstance("MD5");
final InputStream in = new ByteArrayInputStream(b);
final byte[] buf = new byte[2048];
int n;
while ((n = in.read(buf)) > 0) digest.update(buf, 0, n);
// now compute the hex-representation of the md5 digest
return digest.digest();
} catch (final java.security.NoSuchAlgorithmException e) {
System.out.println("Internal Error at md5:" + e.getMessage());
} catch (final java.io.IOException e) {
System.out.println("byte[] error: " + e.getMessage());
return null;
public static void main(final String[] s) {
// usage example:
// java -classpath classes de.anomic.kelondro.kelondroDigest -md5 DATA/HTCACHE/mediawiki/wikipedia.de.xml
// java -classpath classes de.anomic.kelondro.kelondroDigest -md5 readme.txt
// java -classpath classes de.anomic.kelondro.kelondroDigest -fb64 DATA/HTCACHE/responseHeader.heap
// compare with:
// md5 readme.txt
final long start = System.currentTimeMillis();
if (s.length == 0) {
System.out.println("usage: -[md5|fingerprint] <arg>");
if (s[0].equals("-md5")) {
// generate a md5 from a given file
final File f = new File(s[1]);
try {
System.out.println("MD5 (" + f.getName() + ") = " + encodeMD5Hex(f));
} catch (final IOException e) {
if (s[0].equals("-fhex")) {
// generate a fast fingerprint from a given file
final File f = new File(s[1]);
System.out.println("fingerprint hex (" + f.getName() + ") = " + fastFingerprintHex(f, true));
if (s[0].equals("-fb64")) {
// generate a fast fingerprint from a given file
final File f = new File(s[1]);
System.out.println("fingerprint b64 (" + f.getName() + ") = " + fastFingerprintB64(f, true));
// Takes a string as input.
// Please don't delete this without making sure that it is not needed by reconfigureYACY.sh anymore. (Low012)
if (s[0].equals("-strfhex") && s.length > 1) {
System.out.println("time: " + (System.currentTimeMillis() - start) + " ms");