<?php
/**
 * Helper functions for the sitemap crawler: compatibility polyfills, a small
 * file cache, an HTTP fetcher, URL normalisation and the recursive crawler.
 *
 * The functions below read several constants that are not defined in this
 * part of the file and must be provided by the including script:
 * USE_CACHE, CACHE, CACHE_TTL, USE_FOPEN, USE_CURL, CONNECT_TIME_OUT,
 * CRAWL_TIME_OUT, USE_WWW and SITE. crawl() additionally reads
 * $GLOBALS['start_time'] and the global array $urls.
 */

// Upper bound compared against _mem() (which reports megabytes) in crawl().
define('MEMORY_LIMIT', 50);
// Version number embedded in the crawler's User-Agent string.
define('SM_VERSION', 0.2);

if (!function_exists('file_put_contents')) {
    if (!defined('LOCK_EX')) {
        define('LOCK_EX', 2);
    }

    /**
     * Fallback for PHP builds without file_put_contents().
     *
     * The file is always truncated and written under an exclusive lock;
     * $flags is accepted for signature compatibility but not inspected.
     *
     * @param string $filename Path of the file to write.
     * @param string $content  Data to write.
     * @param mixed  $flags    Ignored.
     * @return int|false Number of bytes written, or false on failure.
     */
    function file_put_contents($filename, $content, $flags = null)
    {
        if (($fh = @fopen($filename, 'wb')) === false) {
            return false;
        }
        if (!@flock($fh, LOCK_EX)) {
            // Close the handle on failure instead of leaking it.
            @fclose($fh);
            return false;
        }
        $written = @fwrite($fh, $content);
        // Closing the handle also releases the lock.
        @fclose($fh);
        return $written;
    }
}

if (!function_exists('file_get_contents')) {
    /**
     * Fallback for PHP builds without file_get_contents().
     *
     * @param string $filename Path or URL to read.
     * @return string|false The contents, or false when it cannot be opened.
     */
    function file_get_contents($filename)
    {
        if (false === $fh = @fopen($filename, 'rb')) {
            return false;
        }
        clearstatcache();
        if ($fsize = @filesize($filename)) {
            $data = fread($fh, $fsize);
        } else {
            // Size unknown (or zero): read until end of stream.
            $data = '';
            while (!feof($fh)) {
                $data .= fread($fh, 8192);
            }
        }
        fclose($fh);
        return $data;
    }
}

if (!function_exists('sys_get_temp_dir')) {
    // Based on http://www.phpit.net/
    // article/creating-zip-tar-archives-dynamically-php/2/
    /**
     * Fallback for PHP builds without sys_get_temp_dir().
     *
     * @return string|false Temporary directory path, or false if unknown.
     */
    function sys_get_temp_dir()
    {
        // Try to get the directory from the environment first.
        if (!empty($_ENV['TMP'])) {
            return realpath($_ENV['TMP']);
        } elseif (!empty($_ENV['TMPDIR'])) {
            return realpath($_ENV['TMPDIR']);
        } elseif (!empty($_ENV['TEMP'])) {
            return realpath($_ENV['TEMP']);
        }

        // Detect by creating a temporary file: the random directory name
        // should not exist, so tempnam() falls back to the system directory.
        $temp_file = tempnam(md5(uniqid(rand(), true)), '');
        if ($temp_file) {
            $temp_dir = realpath(dirname($temp_file));
            unlink($temp_file);
            return $temp_dir;
        }
        return false;
    }
}

/**
 * Reads from or writes to the file cache.
 *
 * With a non-empty $val the value is serialized into CACHE.$name. Otherwise
 * the entry is looked up and used only if it is younger than CACHE_TTL
 * seconds. Empty values are never stored.
 *
 * NOTE(review): CACHE is concatenated directly with $name, so it presumably
 * has to end with a directory separator - confirm where CACHE is defined.
 * NOTE(review): entries are restored with unserialize(); the cache directory
 * must not be writable by untrusted parties.
 *
 * @param string $name     Cache entry name (file name inside CACHE).
 * @param mixed  $val      Value to store; leave empty to read.
 * @param bool   $checking When reading, only report whether a fresh entry
 *                         exists instead of loading it.
 * @return mixed By reference: true after a successful write or a positive
 *               $checking lookup, the cached value on a read hit, false
 *               otherwise (including when USE_CACHE is off).
 */
function & _cache($name, &$val = 0, $checking = false)
{
    $retVal = false;
    if (!USE_CACHE) {
        return $retVal;
    }
    if (!is_dir(CACHE)) {
        _error('Cache Directory <strong>"'.CACHE.'"</strong> does not exist', 'WARNING');
        return $retVal;
    }
    $file = CACHE.$name;

    // Write mode.
    if (!empty($val)) {
        if (!@file_put_contents($file, serialize($val), LOCK_EX)) {
            _error('Could not write cache, please check Cache folder <strong>"'.CACHE.'"</strong> is writtable', 'WARNING');
        } else {
            $retVal = true;
        }
        return $retVal;
    }

    // Read mode: only entries younger than CACHE_TTL count.
    if (file_exists($file) && time() - filemtime($file) < CACHE_TTL) {
        if ($checking) {
            $retVal = true;
        } else {
            $cont = @unserialize(@file_get_contents($file));
            if (!$cont) {
                _error('Could not get cache!', 'WARNING');
            } else {
                return $cont;
            }
        }
    }
    return $retVal;
}

/**
 * Reads one chunk-size line of a "Transfer-Encoding: chunked" body.
 *
 * @param resource $fp Open socket positioned at a chunk-size line.
 * @return int|float Chunk length in bytes; 0 at end of stream or when the
 *                   line is not a valid hexadecimal size.
 */
function _smc_chunk_size($fp)
{
    $line = fgets($fp, 1024);
    if ($line === false) {
        return 0;
    }
    // Drop an optional ";extension" suffix before decoding.
    $size = trim($line);
    if (($pos = strpos($size, ';')) !== false) {
        $size = trim(substr($size, 0, $pos));
    }
    return preg_match('/^[0-9a-f]+$/i', $size) ? hexdec($size) : 0;
}

/**
 * Retrieves the contents of a URL, using the cache when possible.
 *
 * Transport is chosen from configuration: file_get_contents() when USE_FOPEN
 * is set, a raw socket when USE_CURL is off or cURL is unavailable, cURL
 * otherwise (using the curl library is much faster).
 *
 * @param string $url       URL to fetch; "http://" is prepended when no
 *                          http/https scheme is present.
 * @param int    $redirects Internal redirect depth counter for the socket
 *                          transport; callers should not pass it.
 * @return array|false By reference: array with key 'content' and, when the
 *                     final URL differs from the requested one, 'new_url';
 *                     false on connection errors or non-text documents.
 */
function & get_url($url, $redirects = 0)
{
    $fbug = false;

    // Normalise before touching the cache so lookup and store use one key.
    // Only a leading http:// or https:// counts as a scheme.
    if (!preg_match('#^https?://#i', $url)) {
        $url = 'http://'.$url;
    }

    // Check for a cached version.
    $cached =& _cache(md5($url));
    if ($cached) {
        return $cached;
    }

    $cont = array();
    $header = null;
    $cookie_file = sys_get_temp_dir().'/smc_cookies';
    $user_agent = "Sitemaps Creator ".SM_VERSION."(compatible; sitemapcreatorbot/".SM_VERSION."; +http://gadelkareem.com/) ";
    ini_set('user_agent', $user_agent);

    if (USE_FOPEN) {
        $cont['content'] = trim(file_get_contents($url));
    } elseif (!USE_CURL || !function_exists('curl_init')) {
        $parse = parse_url($url);
        if ($parse === false || empty($parse['host'])) {
            _error("Invalid URL <strong>{$url}</strong>", 'WARNING');
            return $fbug;
        }
        $host = $parse['host'];
        $scheme = isset($parse['scheme']) ? strtolower($parse['scheme']) : 'http';
        // An empty path would produce the invalid request line "GET  HTTP/1.1".
        $path = empty($parse['path']) ? '/' : $parse['path'];
        if (empty($parse['port'])) {
            $port = ($scheme == 'https') ? 443 : 80;
        } else {
            $port = (int) $parse['port'];
        }
        // https needs the TLS transport, not a plain TCP socket on port 443.
        $target = ($scheme == 'https' ? 'ssl://' : '').$host;

        $fp = @fsockopen($target, $port, $errno, $errstr, CONNECT_TIME_OUT);
        if ($fp === false) {
            switch ($errno) {
                case -3:
                    _error('Socket creation failed (-3)', 'WARNING');
                    break;
                case -4:
                    _error('DNS lookup failure (-4)', 'WARNING');
                    break;
                case -5:
                    _error('Connection refused or timed out (-5)', 'WARNING');
                    break;
                default:
                    _error('Connection failed ('.$errno.') '.$errstr, 'WARNING');
            }
            // Failures are not cached: _cache() ignores empty values.
            return $fbug;
        }
        stream_set_timeout($fp, CONNECT_TIME_OUT);

        // Query re-encoding adapted from the http://enarion.net/google/ crawler class.
        $query_encoded = '';
        if (!empty($parse['query'])) {
            $pairs = array();
            foreach (explode('&', $parse['query']) as $quer) {
                // Limit 2 keeps values that themselves contain "=".
                $v = explode('=', $quer, 2);
                if (isset($v[1]) && $v[1] !== '') {
                    $pairs[] = $v[0].'='.rawurlencode(urldecode($v[1]));
                } else {
                    $pairs[] = $v[0];
                }
            }
            $query_encoded = str_replace('%2B', '+', '?'.implode('&', $pairs));
        }

        $get  = "GET {$path}{$query_encoded} HTTP/1.1\r\n";
        $get .= "Host: {$host}\r\n";
        $get .= "User-Agent: ".trim($user_agent)."\r\n";
        $get .= "Referer: {$scheme}://{$host}{$path}\r\n";
        $get .= "Connection: close\r\n\r\n";
        stream_set_blocking($fp, true);
        fwrite($fp, $get);

        // Response headers: keys are lower-cased with "-" replaced by "_".
        $header = array();
        while ('' != ($line = trim(fgets($fp, 1024)))) {
            if (false !== ($pos = strpos($line, ':'))) {
                $key = str_replace('-', '_', strtolower(substr($line, 0, $pos)));
                $header[$key] = trim(substr($line, $pos + 1));
            } elseif (preg_match('#^HTTP/\S+\s+([0-9]{3})\b#i', $line, $code)) {
                $header['http_code'] = intval($code[1]);
            }
        }
        $header['url'] = $url;

        if (!empty($header['location'])) {
            // Redirect: the body of this response is irrelevant, so drop the
            // connection and hand back the redirect target's result.
            fclose($fp);
            if (isset($header['http_code']) && $header['http_code'] != 200) {
                _error("<strong>{$header['http_code']}</strong> Found for URL <strong>{$url}</strong>");
            }
            // Same limit as CURLOPT_MAXREDIRS in the cURL transport.
            if ($redirects >= 10) {
                _error("Too many redirects for URL <strong>{$url}</strong>", 'WARNING');
                return $fbug;
            }
            $location = $header['location'];
            if (!preg_match('#^https?://#i', $location)) {
                // Resolve scheme-relative, root-relative and relative targets.
                $authority = $host.(empty($parse['port']) ? '' : ':'.$parse['port']);
                if (substr($location, 0, 2) == '//') {
                    $location = $scheme.':'.$location;
                } elseif ($location[0] == '/') {
                    $location = "{$scheme}://{$authority}{$location}";
                } else {
                    $location = "{$scheme}://{$authority}".preg_replace('#[^/]*$#', '', $path).$location;
                }
            }
            $redirected =& get_url($location, $redirects + 1);
            if (!$redirected) {
                return $fbug;
            }
            if (empty($redirected['new_url'])) {
                $redirected['new_url'] = $location;
            }
            return $redirected;
        }

        // Body: chunked transfer encoding, or read until the peer closes.
        if (isset($header['transfer_encoding']) && strtolower($header['transfer_encoding']) == 'chunked') {
            $chunk = _smc_chunk_size($fp);
        } else {
            $chunk = -1;
        }
        $res = '';
        while ($chunk != 0 && !feof($fp)) {
            $part = fread($fp, $chunk > 0 ? $chunk : 1024);
            if ($part === false || $part === '') {
                // Timeout or broken connection: stop instead of spinning.
                break;
            }
            $res .= $part;
            if ($chunk > 0) {
                $chunk -= strlen($part);
                if ($chunk == 0) {
                    fgets($fp, 1024); // consume the CRLF that ends the chunk
                    $chunk = _smc_chunk_size($fp);
                }
            }
        }
        fclose($fp);
        $cont['content'] = $res;
    } else {
        // cURL is available.
        $options = array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT      => $user_agent,
            CURLOPT_CONNECTTIMEOUT => CONNECT_TIME_OUT,
            CURLOPT_TIMEOUT        => CONNECT_TIME_OUT,
            CURLOPT_MAXREDIRS      => 10,
            CURLOPT_COOKIEJAR      => $cookie_file,
            CURLOPT_COOKIEFILE     => $cookie_file,
        );
        $ch = curl_init($url);
        curl_setopt_array($ch, $options);
        $cont['content'] = trim(curl_exec($ch));
        $err = curl_errno($ch);
        $errmsg = curl_error($ch);
        if ($err) {
            curl_close($ch);
            _error("{$errmsg} for URL <strong>{$url}</strong>", 'ERROR'.$err);
            return $fbug;
        }
        $header = curl_getinfo($ch);
        curl_close($ch);
    }

    if (is_array($header)) {
        $content_type = isset($header['content_type']) ? (string) $header['content_type'] : '';
        // Only text documents are crawled; a missing type counts as non-text.
        if (stripos($content_type, 'text') === false) {
            _error("Document type is <strong>{$content_type}</strong> for URL <strong>{$url}</strong>");
            return $fbug;
        }
        $http_code = isset($header['http_code']) ? $header['http_code'] : 0;
        if ($http_code != 200) {
            // http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
            _error("<strong>{$http_code}</strong> Found for URL <strong>{$url}</strong>");
        }
        if ($header['url'] != $url) {
            $url = $cont['new_url'] = $header['url'];
        }
    }

    // Store under the final URL.
    _cache(md5($url), $cont);
    return $cont;
}

/**
 * Normalises a URL taken from HTML: decodes "&amp;" to "&", removes
 * "sid=<token>" parameters and strips a trailing "?".
 *
 * @param string $url
 * @return string
 */
function cleanurl($url)
{
    return preg_replace("/(sid=[\w\d]+)|(\?$)/i", "", str_replace("&amp;", "&", $url));
}

/**
 * Builds an absolute URL from a link found on a page.
 *
 * @param string $url Base URL (the page, or its <base href>).
 * @param string $sub Link as written in the document.
 * @return string Absolute links are returned with the "www." prefix kept or
 *                removed according to USE_WWW; root-relative links are
 *                anchored at http://SITE; other links are resolved against
 *                the directory of $url; an empty link or "/" returns $url.
 */
function make_url($url, $sub = '')
{
    $sub = trim($sub);

    // Absolute link: scheme, optional "www.", host, remainder.
    if (preg_match('#^([^:/]+://)(www\.)?([^/]+)/?([^\#]*)$#i', $sub, $matches)) {
        $www = USE_WWW ? $matches[2] : '';
        return cleanurl($matches[1].$www.$matches[3].'/'.$matches[4]);
    }
    if ($sub == '' || $sub == '/') {
        return $url;
    }

    // Root-relative link: leading "/./" and "/../" segments cannot climb
    // above the root, so they are simply dropped.
    if ($sub[0] == '/') {
        $sub = preg_replace('#^(/\.\.?)+(?=/)#', '', $sub);
        return cleanurl('http://'.SITE.$sub);
    }

    // Relative link: resolve against the directory of the base URL.
    if (!preg_match('#^([^:/]+://[^/?\#]+)(/[^?\#]*)?#', $url, $parts)) {
        // Base without a scheme: fall back to trimming the last segment.
        return cleanurl(preg_replace('#(/[^/]*)$#', '/', $url).$sub);
    }
    $root = $parts[1];
    $path = (isset($parts[2]) && $parts[2] !== '') ? $parts[2] : '/';
    $dir = preg_replace('#[^/]*$#', '', $path);
    while (preg_match('#^(\.\.?)/#', $sub, $dots)) {
        $sub = (string) substr($sub, strlen($dots[0]));
        if ($dots[1] == '..') {
            // Go up one level; "/" has no parent and stays as it is.
            $dir = preg_replace('#[^/]+/$#', '', $dir);
        }
    }
    return cleanurl($root.$dir.$sub);
}

/**
 * Recursively crawls $url, appending every visited URL to the global $urls.
 *
 * Stops silently once CRAWL_TIME_OUT seconds have passed since
 * $GLOBALS['start_time'] or _mem() exceeds MEMORY_LIMIT. Only links whose
 * host part ends with SITE are followed; common image, style and script
 * files are skipped.
 *
 * @param string $url Page to fetch and scan for links.
 * @return false|null false when the page could not be fetched, null otherwise.
 */
function crawl($url)
{
    if (round(_time() - $GLOBALS['start_time']) > CRAWL_TIME_OUT || _mem() > MEMORY_LIMIT) {
        return;
    }
    global $urls;
    if (!is_array($urls)) {
        $urls = array();
    }

    $cont =& get_url($url);
    if (!$cont) {
        return false;
    }
    if (isset($cont['new_url'])) {
        // The request ended up somewhere else; skip if already visited.
        if (in_array($cont['new_url'], $urls, true)) {
            return;
        }
        $url = $cont['new_url'];
    }
    $urls[] = $url;

    $matches = array();
    preg_match_all('#\s+(?:href|src|URL|action)\s*=\s*["\']?((?!mailto:|news:|javascript:|ftp:|telnet:|callto:|"|ed2k:)[^"\'\#\s>]+)#is', $cont['content'], $matches);

    // The document base does not change per link, so look it up once.
    $base = array();
    preg_match('#<base\s+(?:[^>]*\s+)?href\s*=\s*["\']?(https?://[^"\'\#\s>]*)#i', $cont['content'], $base);
    $base_url = empty($base[1]) ? $url : $base[1];

    $site_pattern = '#^https?://[^/]*'.preg_quote(SITE, '#').'/[^\#]*#i';
    foreach ($matches[1] as $sub) {
        // Exclude graphics, stylesheets and scripts.
        if (preg_match('#\.(ico|png|jpg|gif|css|js)(\?[^\?/]*)?$#i', $sub)) {
            continue;
        }
        $curl = make_url($base_url, $sub);
        if (!preg_match($site_pattern, $curl)) {
            continue;
        }
        // $urls grows during recursion, so test membership on every link.
        if (!in_array($curl, $urls, true)) {
            crawl($curl);
        }
    }
}

/**
 * Prints a value for debugging, HTML-escaped inside <pre>.
 *
 * Arrays and objects are exported, a string naming an existing file is
 * replaced by that file's contents, and empty values print a placeholder.
 *
 * @param mixed $val
 * @return void
 */
function _echo($val)
{
    echo "<hr /><pre >";
    if (is_array($val) || is_object($val)) {
        $val = var_export($val, 1);
    } elseif (is_string($val) && is_file($val)) {
        $val = file_get_contents($val);
    } elseif (empty($val)) {
        $val = "nothing to echo";
    }
    echo htmlspecialchars((string) $val);
    echo "</pre><hr />";
}

/**
 * Prints a message. $msg is emitted as-is because callers in this file
 * embed HTML markup in it.
 *
 * @param string $msg
 * @param string $title Label printed before the message.
 * @return void
 */
function _error($msg, $title = 'NOTICE')
{
    echo "<div><strong>{$title}: </strong>{$msg}</div>";
}

/**
 * @return float Current Unix time in seconds, with microseconds.
 */
function _time()
{
    return microtime(true);
}

/**
 * @return float|false Memory usage in megabytes (peak usage when available),
 *                     or false if PHP offers no memory functions.
 */
function _mem()
{
    if (function_exists("memory_get_peak_usage")) {
        $mem = memory_get_peak_usage(true);
    } elseif (function_exists("memory_get_usage")) {
        $mem = memory_get_usage(true);
    } else {
        return false;
    }
    return round($mem / 1024 / 1024, 2);
}
?>