isGooglebotUA() === TRUE || $this->isGoogleCrawlerUA() === TRUE ){ // this is the correct entry for the check
$crawlerIP = defaultGetIP();
$trimmed = file($storelist, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
/*
If we find the current bot ip in our offical crawler ip flatfile $storelist
then simply validate this as a valid crawler
*/
if (array_key_exists($crawlerIP, $trimmed)) {
return 1;
}
$reverse = $this->reverseLookup($crawlerIP);
$isValid = $this->matchGoogleBots($crawlerIP, $reverse);
if ( $isValid == 1 ){
file_put_contents($storelist, "$crawlerIP\n", FILE_APPEND | LOCK_EX);
}
return $isValid; // 0=fake google bot // 1=true google bot
}
return -1; // intern error
}
/**
 * Decide whether the current request comes from a verified search-engine crawler.
 *
 * The client IP (from defaultGetIP()) is first looked up in the flat file
 * $storelist, which holds one previously verified IP per line. On a cache miss
 * the IP is reverse-resolved via reverseLookup() and the resulting host name is
 * checked by matchBotsAll(); an IP that passes is appended to $storelist.
 *
 * @param string $storelist Path of the flat file that caches verified crawler IPs.
 * @return int 1 = verified crawler, 0 = not a verified crawler.
 */
public function isValidCrawler( $storelist = "/tmp/iosec_botlog.txt" ){
    $crawlerIP = $this->defaultGetIP();

    // The cache file does not exist before the first verified crawler has been
    // stored; file() would return false (plus a warning) in that case, so an
    // unreadable/missing file is treated as an empty cache.
    $knownIPs = is_readable($storelist)
        ? file($storelist, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
        : false;
    if ($knownIPs === false) {
        $knownIPs = array();
    }

    // file() yields a numerically indexed list, so the IPs are the array VALUES.
    // (array_key_exists() would compare the IP against the indexes 0..n-1 and
    // therefore never find it.)
    if (in_array($crawlerIP, $knownIPs, true)) {
        return 1;
    }

    $reverse = $this->reverseLookup($crawlerIP);
    $isValid = $this->matchBotsAll($crawlerIP, $reverse);
    if ($isValid == 1) {
        // Remember the verified IP so the next request skips the DNS lookup.
        file_put_contents($storelist, "$crawlerIP\n", FILE_APPEND | LOCK_EX);
    }
    return $isValid; // 0 = fake bot, 1 = verified official bot
}
/**
 * Check whether a reverse-DNS host name belongs to one of the known crawler networks.
 *
 * The dotted IP is rewritten with dashes (1.2.3.4 -> 1-2-3-4) and embedded into
 * the host-name patterns for Baidu, Google, Yandex, MSN/Bing and Amazon
 * (compute-1). The Yahoo pattern only checks the ".crawl.yahoo.com" suffix and
 * does not include the IP. Matching is case-insensitive and every pattern must
 * match up to the END of the host name (an optional trailing dot is tolerated),
 * so "crawl-1-2-3-4.googlebot.com.evil.example" is rejected.
 *
 * NOTE(review): this validates the PTR name only; no forward lookup of the
 * host name back to the IP is done here - confirm whether a caller does that.
 *
 * @param string $orgIP   Client IP in dotted notation.
 * @param string $reverse Host name obtained from the reverse lookup of $orgIP.
 * @return int 1 if $reverse matches one of the crawler patterns, otherwise 0.
 */
public function matchBotsAll($orgIP, $reverse){
    // $orgIP may originate from a request header, so it is escaped before it
    // becomes part of a regular expression. Without this, a value such as
    // "x|NONE|1.2.3.4" would add an alternative that matches the 'NONE'
    // placeholder returned for failed reverse lookups.
    $reverse_host = preg_quote(str_replace(".", "-", $orgIP), '/');

    $hostPatterns = array(
        'baiduspider-' . $reverse_host . '\.crawl\.baidu\.com',
        'crawl-' . $reverse_host . '\.googlebot\.com',
        '\.crawl\.yahoo\.com',
        'spider-' . $reverse_host . '\.yandex\.com',
        'msnbot-' . $reverse_host . '\.search\.msn\.com',
        $reverse_host . '\.compute-1\.amazonaws\.com',
    );

    foreach ($hostPatterns as $hostPattern) {
        // "\.?\z" anchors the pattern at the end of the host name.
        if (preg_match('/(?:' . $hostPattern . ')\.?\z/i', $reverse)) {
            return 1; // host name belongs to an official bot network
        }
    }
    return 0; // host name is NOT an official bot
}
/**
 * Check whether a reverse-DNS host name is a Googlebot host for the given IP.
 *
 * The dotted IP is rewritten with dashes (1.2.3.4 -> 1-2-3-4) and the host
 * name must end, case-insensitively, in "crawl-<ip-with-dashes>.googlebot.com"
 * (an optional trailing dot is tolerated).
 *
 * NOTE(review): only the googlebot.com domain is accepted and no forward
 * lookup is performed here - confirm this matches the intended policy.
 *
 * @param string $orgIP   Client IP in dotted notation.
 * @param string $reverse Host name obtained from the reverse lookup of $orgIP.
 * @return int 1 if $reverse is a Googlebot host name for $orgIP, otherwise 0.
 */
public function matchGoogleBots($orgIP, $reverse){
    // Escape the IP before embedding it in the pattern; it may come from a
    // request header and must not be able to inject regex syntax.
    $reverse_host = preg_quote(str_replace(".", "-", $orgIP), '/');
    $gbot = 'crawl-' . $reverse_host . '\.googlebot\.com';

    // "\.?\z" anchors the match at the end of the host name so that
    // "crawl-1-2-3-4.googlebot.com.evil.example" does not pass.
    if (preg_match('/' . $gbot . '\.?\z/i', $reverse)) {
        return 1; // host name is an official googlebot
    }
    return 0; // host name is NOT an official googlebot
}
/**
 * Determine the client IP of the current request.
 *
 * The sources are tried in this order: HTTP_X_FORWARDED_FOR, HTTP_X_REAL_IP,
 * REMOTE_ADDR. The first source that holds a syntactically valid IPv4 address
 * wins. A comma-separated list (as X-Forwarded-For may carry) is reduced to
 * its first entry. When no source holds a valid IPv4 address, the value of the
 * last source that was set is returned as-is (e.g. an IPv6 REMOTE_ADDR), and
 * "0" when none of the sources is set.
 *
 * NOTE(review): the two forwarded headers are supplied by the client unless a
 * trusted proxy overwrites them - confirm the deployment guarantees that,
 * otherwise a client can claim an arbitrary IP here.
 *
 * @return string The detected IP, or "0" when nothing is available.
 */
public function defaultGetIP(){
    $IP = 0;
    $sources = array('HTTP_X_FORWARDED_FOR', 'HTTP_X_REAL_IP', 'REMOTE_ADDR');

    foreach ($sources as $source) {
        if (!isset($_SERVER[$source])) {
            continue;
        }
        $candidate = $_SERVER[$source];
        // It seems that some hosts may modify _SERVER vars into arrays.
        if (is_array($candidate) && isset($candidate[0])) {
            $candidate = $candidate[0];
        }
        if (!is_string($candidate)) {
            continue;
        }

        // "client, proxy1, proxy2" -> "client"
        $entries = explode(',', $candidate);
        $candidate = trim($entries[0]);
        $IP = $candidate;

        // Whole-string validation: a value that merely CONTAINS something
        // IP-like is not accepted and the next source is tried instead.
        if (filter_var($candidate, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
            return $candidate;
        }
    }
    return trim((string) $IP);
}
/**
 * Resolve the PTR (reverse DNS) host name of a dotted IPv4 address.
 *
 * The octets are reversed and suffixed with ".in-addr.arpa" to build the
 * query name, which is then looked up with dns_get_record().
 *
 * @param string $IP Dotted IPv4 address.
 * @return string The 'target' of the first PTR record, or the literal string
 *                'NONE' when the lookup yields nothing.
 */
public function reverseLookup($IP){
    $octets = explode(".", $IP);
    $queryName = implode(".", array_reverse($octets)) . ".in-addr.arpa";

    // Warnings from a failed lookup are suppressed; failure is reported
    // through the 'NONE' return value instead.
    $records = @dns_get_record($queryName, DNS_PTR);
    if (empty($records)) {
        return 'NONE';
    }
    return $records[0]['target'];
}
/**
 * Tell whether a value is a dotted-decimal IPv4 address.
 *
 * The value must consist of exactly four groups of decimal digits separated
 * by dots, and every group must be in the range 0..255.
 *
 * @param mixed $IP Value to check.
 * @return bool true for a valid IPv4 address, otherwise false.
 */
public function isValidIP($IP){
    // "\z" (instead of "$") also rejects a trailing newline.
    if (!is_string($IP) || !preg_match('/^(\d+)\.(\d+)\.(\d+)\.(\d+)\z/', $IP, $m)) {
        return false;
    }
    // $m[0] is the complete match; the four octets are the groups 1..4.
    for ($i = 1; $i <= 4; $i++) {
        if ($m[$i] > 255) {
            return false;
        }
    }
    return true;
}
/**
 * Rebuild the URL of the current request from $_SERVER.
 *
 * Host: HTTP_HOST when set and non-empty, otherwise SERVER_NAME.
 * Scheme: "https" when HTTPS is set to a non-empty value other than "off"
 * (some servers report "off" for plain HTTP requests), otherwise "http".
 * Path and query: REQUEST_URI. Missing entries contribute an empty string.
 *
 * @return string The URL in the form scheme://host/request-uri.
 */
public function getRequestedURL(){
    if (isset($_SERVER['HTTP_HOST']) && $_SERVER['HTTP_HOST']) {
        $host = $_SERVER['HTTP_HOST'];
    } else {
        $host = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : '';
    }

    $https = isset($_SERVER['HTTPS']) ? $_SERVER['HTTPS'] : '';
    $prefix = 'http';
    // A plain truthiness test would treat the string "off" as HTTPS.
    if ($https && strtolower((string) $https) !== 'off') {
        $prefix = 'https';
    }

    $requestUri = isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '';
    return $prefix . '://' . $host . $requestUri;
}
/**
 * Convert a numeric IPv4 address into dotted notation via long2ip().
 *
 * The value handed to long2ip() is -(4294967295 - ($ip - 1)), which is
 * arithmetically $ip - 4294967296.
 * NOTE(review): presumably this shifts large unsigned values into the
 * negative range for platforms with 32-bit integers - confirm.
 *
 * @param int|string $ip Numeric IPv4 address (e.g. the output of inet_aton()).
 * @return string Dotted IPv4 address.
 */
public function inet_ntoa($ip){
    $offset = $ip - 1;
    $complement = 4294967295 - $offset;
    $shifted = -$complement;
    return long2ip($shifted);
}
/**
 * Convert a dotted IPv4 address into its unsigned decimal representation.
 *
 * The result of ip2long() is formatted with "%u", i.e. as an unsigned
 * decimal number, and returned as a string.
 *
 * @param string $ip Dotted IPv4 address.
 * @return string Unsigned decimal value of the address.
 */
public function inet_aton($ip){
    $numeric = ip2long($ip);
    $unsigned = sprintf("%u", $numeric);
    return $unsigned;
}
/**
 * Tell whether the request's User-Agent header claims to be Googlebot.
 *
 * The header must contain "Googlebot/<digit>.<digit>", e.g.
 *   Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
 * or (rarely used)
 *   Googlebot/2.1 (+http://www.google.com/bot.html)
 *
 * This inspects the header only; it does not verify the claim.
 *
 * @return bool true if the User-Agent matches, otherwise false.
 */
public function isGooglebotUA(){
    $UA = (isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '');

    // A former second check used the pattern '/Google/ig'. "g" is not a valid
    // PCRE modifier in PHP, so preg_match() failed and this method could never
    // return true. The check is also implied by the pattern below ("Googlebot"
    // contains "Google"), hence it is dropped rather than repaired.
    return preg_match('/Googlebot\/\d\.\d/', $UA) === 1;
}
/**
 * Tell whether the request's User-Agent header looks like any Google crawler
 * or Google service (Googlebot variants, AdsBot, Feedfetcher, Mediapartners,
 * Site-Verification, ...).
 *
 * The header is tested case-insensitively against the pattern list below and
 * the first match wins. This inspects the header only; it does not verify
 * that the request really originates from Google.
 *
 * @return bool true if the User-Agent matches one of the patterns, otherwise false.
 */
public function isGoogleCrawlerUA(){
    $UA = (isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '');
    $googPat = array(
        '@^Mozilla/5\\.0 \\(.*Google Keyword Tool.*\\)$@',
        '@^Mozilla/5\\.0 \\(.*Feedfetcher\\-Google.*\\)$@',
        '@^Feedfetcher\\-Google\\-iGoogleGadgets.*$@',
        '@^searchbot admin\\@google\\.com$@',
        '@^Google\\-Site\\-Verification.*$@',
        '@^Google OpenSocial agent.*$@',
        '@^.*Googlebot\\-Mobile/2\\..*$@',
        '@^AdsBot\\-Google\\-Mobile.*$@',
        '@^google \\(.*Enterprise.*\\)$@',
        '@^Mediapartners\\-Google.*$@',
        '@^GoogleFriendConnect.*$@',
        '@^googlebot\\-urlconsole$@',
        '@^.*Google Web Preview.*$@',
        '@^Feedfetcher\\-Google.*$@',
        '@^AppEngine\\-Google.*$@',
        '@^Googlebot\\-Video.*$@',
        '@^Googlebot\\-Image.*$@',
        '@^Google\\-Sitemaps.*$@',
        '@^Googlebot/Test.*$@',
        '@^Googlebot\\-News.*$@',
        '@^.*Googlebot/2\\.1.*$@',
        // Was 'Googlebot*', which quantifies only the letter "t" and thus
        // matched nothing but strings ENDING in "Googlebo"/"Googlebot...".
        // Any User-Agent containing "Googlebot" is matched now.
        '@^.*Googlebot.*$@',
        '@^AdsBot\\-Google.*$@',
        '@^Google$@'
    );
    foreach ($googPat as $pat) {
        // The "i" modifier is appended after the closing "@" delimiter.
        if (preg_match($pat . 'i', $UA)) {
            return true;
        }
    }
    return false;
}
}
?>