/**
* ClientIdentification
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 26.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-21 23:59:56 +0200 (Do, 21 Apr 2011) $
* $LastChangedRevision: 7673 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.protocol;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Registry of the HTTP client identities ("agents") that can be used for outgoing requests.
 *
 * Each {@link Agent} bundles the user agent string sent in HTTP requests, the names
 * under which the agent is addressed in robots.txt, the minimum delay between two accesses
 * and the client timeout. Agents are looked up by name via {@link #getAgent(String)}.
 */
public class ClientIdentification {

    public static final int clientTimeoutInit = 10000;
    public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
    public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain

    /**
     * Immutable description of one client identity.
     */
    public static class Agent {
        public final String userAgent; // the name that is sent in http request to identify the agent
        public final String[] robotIDs; // the name that is used in robots.txt to identify the agent
        public final int minimumDelta; // the minimum delay between two accesses
        public final int clientTimeout;

        /**
         * @param userAgent the user agent string sent in http requests
         * @param robotIDs the names used in robots.txt to identify the agent (stored as given, not copied)
         * @param minimumDelta the minimum delay between two accesses
         * @param clientTimeout the client timeout
         */
        public Agent(final String userAgent, final String[] robotIDs, final int minimumDelta, final int clientTimeout) {
            this.userAgent = userAgent;
            this.robotIDs = robotIDs;
            this.minimumDelta = minimumDelta;
            this.clientTimeout = clientTimeout;
        }
    }

    private final static String[] browserAgents = new String[]{ // fake browser user agents are NOT AVAILABLE IN P2P OPERATION, only on special customer configurations (commercial users demanded this, I personally think this is inadvisable)
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:22.0) Gecko/20100101 Firefox/22.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0",
        "Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0",
        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:21.0) Gecko/20100101 Firefox/22.0".replace("Firefox/22.0", "Firefox/21.0")
    };

    private static final Random random = new Random(System.currentTimeMillis());

    // registry of all known agents, keyed by their agent name
    private static final Map<String, Agent> agents = new ConcurrentHashMap<String, Agent>();

    /**
     * provide system information (this is part of YaCy protocol)
     *
     * This field must be declared BEFORE the static initializer below: static field
     * initializers and static blocks run in textual order, and generateYaCyBot (called
     * from the static block) reads this value. Declared after the block it would still be
     * null at that point and the default bot string would contain the text "null".
     */
    public static final String yacySystem = System.getProperty("os.arch", "no-os-arch") + " " +
        System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
        "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation();

    public final static String yacyInternetCrawlerAgentName = "YaCy Internet (cautious)";
    public static Agent yacyInternetCrawlerAgent = null; // defined later in static
    public final static String yacyIntranetCrawlerAgentName = "YaCy Intranet (greedy)";
    public static Agent yacyIntranetCrawlerAgent = null; // defined later in static
    public final static String googleAgentName = "Googlebot";
    public final static Agent googleAgentAgent = new Agent("Googlebot/2.1 (+http://www.google.com/bot.html)", new String[]{"Googlebot", "Googlebot-Mobile"}, minimumGlobalDeltaInit / 2, clientTimeoutInit);
    public final static String yacyProxyAgentName = "YaCyProxy";
    public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit);
    public final static String customAgentName = "Custom Agent";
    public final static String browserAgentName = "Random Browser";
    public static Agent browserAgent;

    static {
        // create and register the default YaCy crawler agents
        generateYaCyBot("new");
        // the browser agent is picked once, at class initialization, from the list of fake browser strings
        browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit);
        agents.put(googleAgentName, googleAgentAgent);
        agents.put(browserAgentName, browserAgent);
        agents.put(yacyProxyAgentName, yacyProxyAgent);
    }

    /**
     * produce the YaCy user agent string and (re-)register the internet and intranet
     * crawler agents with it. Both agents share the same user agent string and differ only
     * in their minimum access delta.
     *
     * @param addinfo additional information that is placed in front of the system information
     */
    public static void generateYaCyBot(String addinfo) {
        String agentString = "yacybot (" + addinfo + "; " + yacySystem + ") http://yacy.net/bot.html";
        yacyInternetCrawlerAgent = new Agent(agentString, new String[]{"yacybot"}, minimumGlobalDeltaInit, clientTimeoutInit);
        yacyIntranetCrawlerAgent = new Agent(agentString, new String[]{"yacybot"}, minimumLocalDeltaInit, clientTimeoutInit); // must have the same userAgent String as the web crawler because this is also used for snippets
        agents.put(yacyInternetCrawlerAgentName, yacyInternetCrawlerAgent);
        agents.put(yacyIntranetCrawlerAgentName, yacyIntranetCrawlerAgent);
    }

    /**
     * register a custom agent under the name {@link #customAgentName}.
     * The request is silently ignored if name or string is null or if either of them
     * contains 'yacy' (case-insensitive).
     *
     * @param name the robots.txt identifier of the custom agent
     * @param string the user agent string; the token $$SYSTEM$$ is replaced with the system
     *        information (where the text "java" is replaced by "O")
     * @param minimumdelta the minimum delay between two accesses
     * @param clienttimeout the client timeout
     */
    public static void generateCustomBot(String name, String string, int minimumdelta, int clienttimeout) {
        if (name == null || string == null) {
            return; // nothing usable to register
        }
        if (name.toLowerCase().indexOf("yacy") >= 0 || string.toLowerCase().indexOf("yacy") >= 0) {
            return; // don't allow 'yacy' in custom bot strings
        }
        String agentString = string.replace("$$SYSTEM$$", yacySystem.replace("java", "O"));
        agents.put(customAgentName, new Agent(agentString, new String[]{name}, minimumdelta, clienttimeout));
    }

    /**
     * get the agent that is registered under the given name
     *
     * @param agentName the name of the agent
     * @return the registered agent, or the YaCy internet crawler agent if the name is null,
     *         empty or not registered
     */
    public static Agent getAgent(String agentName) {
        if (agentName == null || agentName.length() == 0) {
            return yacyInternetCrawlerAgent;
        }
        Agent agent = agents.get(agentName);
        return agent == null ? yacyInternetCrawlerAgent : agent;
    }

    /**
     * generating the location string
     *
     * @return the part of the system property user.timezone before the first '/'
     *         (default "nowhere"), followed by '/' and the system property user.language
     *         (default "dumb")
     */
    public static String generateLocation() {
        String loc = System.getProperty("user.timezone", "nowhere");
        final int p = loc.indexOf('/');
        if (p > 0) {
            loc = loc.substring(0, p);
        }
        loc = loc + "/" + System.getProperty("user.language", "dumb");
        return loc;
    }

    /**
     * gets the location out of the user agent
     *
     * The location is the last ';'-separated entry between the first '(' and the next ')'.
     * If the parenthesis is never closed, the text up to the end of the string is used;
     * if there is no ';' inside the parentheses, the whole parenthesized text is the location.
     *
     * @param userAgent in form "useragentinfo (some params; _location_) additional info"
     * @return the trimmed location, or an empty string if userAgent is null or contains no '('
     */
    public static String parseLocationInUserAgent(final String userAgent) {
        if (userAgent == null) {
            return "";
        }
        final int open = userAgent.indexOf('(');
        if (open < 0) {
            // no parameter section at all
            return "";
        }
        final int close = userAgent.indexOf(')', open + 1);
        final String params = (close < 0) ? userAgent.substring(open + 1) : userAgent.substring(open + 1, close);
        // lastIndexOf yields -1 when there is no ';', so the whole section is taken then
        return params.substring(params.lastIndexOf(';') + 1).trim();
    }
}