<?php

// Memory ceiling for a crawl run. crawl() compares this against _mem(),
// which reports usage in megabytes, and stops crawling once it is exceeded.
define('MEMORY_LIMIT', 50);

// Version number of this script; get_url() embeds it in the User-Agent string.
define('SM_VERSION', 0.2);
if(!function_exists('file_put_contents')) {
	if (!defined('LOCK_EX')) define('LOCK_EX', 2);
	/**
	 * Fallback for PHP builds that lack the native file_put_contents().
	 *
	 * Truncates/creates $filename, takes an exclusive lock and writes $content.
	 * The lock is always taken; $flags is accepted only for signature
	 * compatibility with the native function and is otherwise ignored.
	 *
	 * @param string $filename path of the file to write
	 * @param string $content  data to write
	 * @param int    $flags    ignored
	 * @return int|false number of bytes written (as the native function
	 *                   returns), or false when the file cannot be opened,
	 *                   locked or written
	 */
	function file_put_contents($filename, $content, $flags = null) {
		if ( ( $fh = @fopen($filename, 'wb') ) === false) return false;
		if (!@flock($fh, LOCK_EX)) {
			// do not leak the handle when the lock cannot be obtained
			@fclose($fh);
			return false;
		}
		$written = @fwrite($fh, $content);
		// closing the handle also releases the lock, on success and on failure
		@fclose($fh);
		return $written;
	}
}
if (!function_exists('file_get_contents')) {
	/**
	 * Fallback for PHP builds that lack the native file_get_contents().
	 *
	 * Reads the whole file in one call when its size is known, otherwise
	 * reads 8 KB blocks until end of file.
	 *
	 * @param string $filename path (or URL wrapper) to read
	 * @return string|false the data read, or false when the file cannot be opened
	 */
	function file_get_contents($filename) {
		$handle = @fopen($filename, 'rb');
		if ($handle === false) {
			return false;
		}

		// make sure filesize() below does not answer from a stale stat cache
		clearstatcache();
		$size = @filesize($filename);

		if ($size) {
			$buffer = fread($handle, $size);
		} else {
			// size unknown or zero: drain the stream block by block
			for ($buffer = ''; !feof($handle); ) {
				$buffer .= fread($handle, 8192);
			}
		}

		fclose($handle);
		return $buffer;
	}
}
if ( !function_exists('sys_get_temp_dir') )
{
    // Based on http://www.phpit.net/
    // article/creating-zip-tar-archives-dynamically-php/2/
    /**
     * Fallback for PHP builds that lack the native sys_get_temp_dir().
     *
     * Looks at the TMP, TMPDIR and TEMP entries of $_ENV (in that order) and
     * returns the realpath() of the first non-empty one. When none is set it
     * creates a throw-away file with tempnam() and reports the directory that
     * file ended up in.
     *
     * @return string|false resolved temp directory, or FALSE when no
     *                      temporary file could be created
     */
    function sys_get_temp_dir()
    {
        // Environment variables win, first non-empty one decides
        foreach ( array('TMP', 'TMPDIR', 'TEMP') as $env_key )
        {
            if ( !empty($_ENV[$env_key]) )
            {
                return realpath( $_ENV[$env_key] );
            }
        }

        // Ask tempnam() for a file inside a randomly named directory; the name
        // is random so that the directory should not exist, and the directory
        // the file is actually created in is taken as the temp directory
        $probe = tempnam( md5(uniqid(rand(), TRUE)), '' );
        if ( !$probe )
        {
            return FALSE;
        }

        $found_dir = realpath( dirname($probe) );
        unlink( $probe );
        return $found_dir;
    }
}
/**
 * File based cache: stores or fetches a serialized value in CACHE.$name.
 *
 * - Non-empty $val: the value is serialized and written; returns true on
 *   success, false (plus a WARNING) on failure.
 * - Empty $val (including the default and an empty array): the entry is read
 *   instead. Entries that are missing or not younger than CACHE_TTL seconds
 *   yield false. With $checking set only the existence of a fresh entry is
 *   reported (true); otherwise the unserialized value is returned.
 *
 * Always returns false when USE_CACHE is off or the CACHE directory is missing.
 *
 * NOTE(review): entries are read back with unserialize(); the cache directory
 * must not be writable by untrusted parties.
 *
 * @param string $name     file name of the entry inside CACHE
 * @param mixed  $val      value to store; leave empty to read
 * @param bool   $checking when reading, only test whether a fresh entry exists
 * @return mixed cached value, true, or false as described above
 */
function & _cache($name, &$val=0, $checking = false){
	$result = false;

	if( !USE_CACHE ){
		return $result;
	}
	if( !is_dir(CACHE) ){
		_error('Cache Directory <strong>"'.CACHE.'"</strong> does not exist', 'WARNING');
		return $result;
	}

	$path = CACHE.$name;

	#write mode
	if( !empty($val) ){
		if( @file_put_contents($path,serialize($val),LOCK_EX) ){
			$result = true;
		}else{
			_error('Could not write cache, please check Cache folder <strong>"'.CACHE.'"</strong> is writtable',  'WARNING');
		}
		return $result;
	}

	#read mode: a missing or expired entry is a plain miss
	if( !file_exists($path) || time()-filemtime($path) >= CACHE_TTL ){
		return $result;
	}

	if( $checking ){
		$result = true;
		return $result;
	}

	$payload = @unserialize(@file_get_contents($path));
	if( !$payload ){
		_error('Could not get cache!',  'WARNING');
		return $result;
	}

	return $payload;
}

/**
 * Reads one chunk-size line of a "Transfer-Encoding: chunked" body.
 *
 * @param resource $fp open stream positioned at a chunk-size line
 * @return int size of the next chunk; 0 at end of stream or on a malformed line
 */
function _http_read_chunk_size($fp){
	$line = fgets($fp, 1024);
	if( $line === false )
		return 0;
	#only the leading hex digits count; chunk extensions (";name=value") are ignored
	return preg_match('/^\s*([0-9a-f]+)/i', $line, $size) ? hexdec($size[1]) : 0;
}

/**
 * Turns the value of a Location header into an absolute URL.
 *
 * @param string $base_url absolute URL of the response carrying the header
 * @param string $location raw Location value (absolute, "//host/..", "/path" or relative)
 * @return string absolute URL of the redirect target
 */
function _http_resolve_location($base_url, $location){
	$location = trim($location);
	if( preg_match('#^[a-z][a-z0-9+.\-]*://#i', $location) )
		return $location;

	$base = parse_url($base_url);
	$scheme = empty($base['scheme']) ? 'http' : $base['scheme'];
	if( substr($location, 0, 2) == '//' )
		return $scheme.':'.$location;

	$root = $scheme.'://'.$base['host'].(empty($base['port']) ? '' : ':'.$base['port']);
	if( substr($location, 0, 1) == '/' )
		return $root.$location;

	#relative to the directory of the requested document
	$dir = empty($base['path']) ? '/' : preg_replace('#[^/]*$#', '', $base['path']);
	return $root.$dir.$location;
}

/**
 * Fetches $url with a hand written HTTP/1.1 GET over fsockopen().
 *
 * @param string $url        absolute http:// or https:// URL
 * @param string $user_agent value for the User-Agent header
 * @return array|false array('header' => array, 'content' => string) or false
 *                     when the connection could not be established. Header
 *                     names are lower-cased with "-" replaced by "_"; the keys
 *                     'url' and 'http_code' are always present.
 */
function _http_socket_fetch($url, $user_agent){
	$parse = parse_url($url);
	if( $parse === false || empty($parse['host']) ){
		_error('Connection failed (0) Invalid URL', 'WARNING');
		return false;
	}
	$scheme = empty($parse['scheme']) ? 'http' : strtolower($parse['scheme']);
	$host = $parse['host'];
	$port = empty($parse['port']) ? ($scheme == 'https' ? 443 : 80) : (int)$parse['port'];
	#a URL such as http://host has no path component; the request line needs one
	$path = empty($parse['path']) ? '/' : $parse['path'];
	#https needs the ssl:// transport, a plain socket on port 443 cannot talk TLS
	$transport = ($scheme == 'https') ? 'ssl://' : '';

	$errno = 0;
	$errstr = '';
	if ( ($fp = @fsockopen($transport.$host, $port, $errno, $errstr, CONNECT_TIME_OUT)) === false) {
		switch($errno) {
			case -3: _error( 'Socket creation failed (-3)', 'WARNING'); break;
			case -4: _error( 'DNS lookup failure (-4)', 'WARNING'); break;
			case -5: _error( 'Connection refused or timed out (-5)', 'WARNING'); break;
			default: _error( 'Connection failed ('.$errno.') '.$errstr, 'WARNING');
		}
		return false;
	}
	stream_set_timeout($fp, CONNECT_TIME_OUT);
	stream_set_blocking($fp, true);

	#code from http://enarion.net/google/ crawler class
	#re-encode every query value, leaving "+" as it was
	$query_encoded = '';
	if (!empty($parse['query']) ) {
		$pairs = array();
		foreach (explode('&', $parse['query']) as $quer) {
			$v = explode('=', $quer, 2);
			$pairs[] = (count($v) == 2) ? $v[0].'='.rawurlencode(urldecode($v[1])) : $v[0];
		}
		$query_encoded = str_replace('%2B','+', '?'.implode('&', $pairs));
	}

	$host_header = empty($parse['port']) ? $host : $host.':'.$port;
	$get = "GET {$path}{$query_encoded} HTTP/1.1\r\n";
	$get .= "Host: {$host_header}\r\n";
	$get .= "User-Agent: ".trim($user_agent)."\r\n";
	$get .= "Referer: {$scheme}://{$host}{$path}\r\n";
	$get .= "Connection: close\r\n\r\n";
	fwrite($fp, $get);

	#status line and headers, up to the first empty line
	$header = array('url' => $url, 'http_code' => 0);
	while ( ($line = fgets($fp, 1024)) !== false ) {
		$line = trim($line);
		if ( $line === '' )
			break;
		if ( preg_match('#^HTTP/\S+\s+([0-9]{3})#i', $line, $code) )
			$header['http_code'] = intval($code[1]);
		elseif ( false !== ($pos=strpos($line, ':')) )
			$header[str_replace('-', '_', strtolower(substr($line, 0, $pos)))] = trim(substr($line, $pos+1));
	}

	#body: $chunk > 0 is the number of bytes left in the current chunk,
	#-1 means "not chunked, read until the server closes the connection"
	$chunked = isset($header['transfer_encoding']) && strtolower($header['transfer_encoding']) == 'chunked';
	$chunk = $chunked ? _http_read_chunk_size($fp) : -1;

	$res = '';
	while ($chunk != 0 && !feof($fp)) {
		if ($chunk > 0){
			$part = fread($fp, $chunk);
			if ($part === false || $part === '')
				break;
			$chunk -= strlen($part);
			$res .= $part;
			if ($chunk == 0){
				fgets($fp, 1024);	#CRLF that terminates the chunk data
				$chunk = _http_read_chunk_size($fp);
			}
		} else {
			$res .= fread($fp, 1024);
		}
	}
	fclose($fp);

	return array('header' => $header, 'content' => $res);
}

/**
 * Retrieves the contents of a URL, using the cache when possible.
 *
 * Transport is chosen by configuration: URL fopen wrappers (USE_FOPEN), a raw
 * socket request (when USE_CURL is off or cURL is not installed) or cURL,
 * which is much faster.
 *
 * @param string $url       URL to fetch; "http://" is prepended when no
 *                          http/https scheme is present
 * @param int    $redirects internal redirect depth counter for the socket
 *                          transport; callers normally omit it
 * @return array|false array with key 'content' (body) and, when the final URL
 *                     differs from the requested one, 'new_url'; false on
 *                     connection errors, too many redirects or non-text documents
 */
function & get_url($url, $redirects = 0){
	$fbug = false;
	#_cache() takes its value by reference, so it needs a real variable. As
	#written _cache() never stores empty values, which makes the failure-path
	#calls below no-ops; they are kept to preserve the original intent.
	$nothing = array();

	#normalise first so the cache lookup uses the same key the result is stored under
	if( !preg_match('#^https?://#i', $url) )
		$url = 'http://'.$url;

	#checking for cached version
	$cached =& _cache(md5($url));
	if( $cached ) return $cached;

	$cont = array();
	$cookie_file = sys_get_temp_dir().'/smc_cookies';
	$user_agent = "Sitemaps Creator ".SM_VERSION."(compatible; sitemapcreatorbot/".SM_VERSION."; +http://gadelkareem.com/) ";
	ini_set('user_agent', $user_agent);

	if( USE_FOPEN )
		$cont['content']= trim(file_get_contents($url));
	elseif( !USE_CURL || !function_exists('curl_init')){
		$response = _http_socket_fetch($url, $user_agent);
		if( $response === false ){
			_cache(md5($url), $nothing);
			return $fbug;
		}
		$header = $response['header'];

		#follow redirects ourselves; the body of a 3xx answer is not the document
		if( !empty($header['location']) && $header['http_code'] >= 300 && $header['http_code'] < 400 ){
			if( $redirects >= 10 ){
				_error("Too many redirects for URL <strong>{$url}</strong>", 'WARNING');
				return $fbug;
			}
			$location = _http_resolve_location($url, $header['location']);
			$target = get_url($location, $redirects + 1);
			if( empty($target) )
				return $fbug;
			if( !isset($target['new_url']) )
				$target['new_url'] = $location;
			#the recursive call has already validated and cached the document
			return $target;
		}
		$cont['content'] = $response['content'];
	}else{
		#curl exists
		$options = array(
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_FOLLOWLOCATION => true,
			CURLOPT_USERAGENT      => $user_agent,
			CURLOPT_CONNECTTIMEOUT => CONNECT_TIME_OUT,
			CURLOPT_TIMEOUT        => CONNECT_TIME_OUT,
			CURLOPT_MAXREDIRS      => 10,
			CURLOPT_COOKIEJAR      => $cookie_file,
			CURLOPT_COOKIEFILE     => $cookie_file,
		);
		$ch = curl_init($url);
		curl_setopt_array( $ch, $options );
		$cont['content'] = trim(curl_exec($ch));
		$err     = curl_errno( $ch );
		$errmsg  = curl_error( $ch );
		$info    = curl_getinfo($ch);
		#release the handle on the error path as well
		curl_close($ch);
		if($err){
			_error("{$errmsg} for URL <strong>{$url}</strong>", 'ERROR'.$err);
			_cache(md5($url), $nothing);
			return $fbug;
		}
		$header = $info;
	}
	if(isset($header)){
		#either field may be missing (no Content-Type sent, unparsable status line)
		$content_type = isset($header['content_type']) ? (string)$header['content_type'] : '';
		$http_code = isset($header['http_code']) ? $header['http_code'] : 0;
		if ( stripos($content_type, 'text') === false ){
			_error("Document type is <strong>{$content_type}</strong> for URL <strong>{$url}</strong>");
			_cache(md5($url), $nothing);
			return $fbug;
		}
		if ( $http_code != 200 ){
			#http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
			_error("<strong>{$http_code}</strong> Found for URL <strong>{$url}</strong>");
		}
		if ( isset($header['url']) && $header['url'] != $url ){
			$url = $cont['new_url'] = $header['url'];
		}
	}
	#caching
	_cache(md5($url), $cont);
	return $cont;
}



/**
 * Normalises a URL taken from HTML: decodes "&amp;" to "&", removes session
 * id parameters and drops a dangling "?" or "&" at the end of the query.
 *
 * Only whole parameters named "sid" or "PHPSESSID" (case-insensitive) are
 * removed; names that merely end in "sid", such as "classid", are left intact.
 *
 * @param string $url URL as found in the document
 * @return string cleaned URL
 */
function cleanurl($url){
	$url = str_replace("&amp;", "&", $url);
	#strip the session parameter together with the separator that follows it
	$url = preg_replace('/(?<=[?&])(?:sid|phpsessid)=[^&#]*&?/i', '', $url);
	#whatever "?" / "&" is now left hanging at the end of the query goes too
	return preg_replace('/[?&]+(?=#|$)/', '', $url);
}






/**
 * Collapses "." and ".." segments of an absolute URL path.
 *
 * @param string $path path starting with "/"
 * @return string the path without dot segments; ".." never climbs above "/"
 */
function _remove_dot_segments($path){
	$kept = array();
	$segments = explode('/', $path);
	$last = count($segments) - 1;
	foreach( $segments as $i => $segment ){
		if( $segment === '.' || $segment === '..' ){
			#index 0 of $kept is the empty string before the leading "/"
			if( $segment === '..' && count($kept) > 1 )
				array_pop($kept);
			#a dot segment at the very end still denotes a directory
			if( $i == $last )
				$kept[] = '';
			continue;
		}
		$kept[] = $segment;
	}
	return implode('/', $kept);
}

/**
 * Reformats a link found in a page into an absolute URL.
 *
 * - Absolute links are kept, with the "www." prefix dropped unless USE_WWW is set.
 * - An empty link or "/" returns $url unchanged.
 * - Links starting with "/" are resolved against http://SITE.
 * - Everything else is resolved against the directory of $url.
 * "." and ".." segments are collapsed and every produced URL goes through cleanurl().
 *
 * @param string $url URL of the page (or its <base href>) the link was found on
 * @param string $sub link target as written in the page
 * @return string absolute URL
 */
function make_url($url,$sub=''){
	$sub = trim($sub);

	#protocol-relative link: it names a host, so it must not be glued onto SITE
	if( substr($sub, 0, 2) == '//' )
		$sub = 'http:'.$sub;

	if( preg_match('#^([^:/]+://)(www\.)?([^/]+)/?([^\#]*)$#i', $sub, $matches) ){
		$host = (USE_WWW ? $matches[2] : '').$matches[3];
		return cleanurl($matches[1].$host.'/'.$matches[4]);
	}

	if($sub=='' || $sub == '/')
		return $url;

	#keep the link's query string away from the path handling below
	$tail = '';
	if( false !== ($pos = strpos($sub, '?')) ){
		$tail = substr($sub, $pos);
		$sub = substr($sub, 0, $pos);
	}

	if( $sub !== '' && $sub[0] == '/' ){
		$root = 'http://'.SITE;
		$path = $sub;
	}else{
		#split the base into "scheme://host" and its path, ignoring query and fragment
		$base = preg_replace('/[?#].*$/s', '', $url);
		if( !preg_match('#^((?:[^:/]+://)?[^/]+)(/.*)?$#s', $base, $parts) )
			return cleanurl($url.$sub.$tail);
		$root = $parts[1];
		$path = isset($parts[2]) ? $parts[2] : '/';
		#a query-only link ("?page=2") addresses the same document; anything
		#else replaces the last path segment of the base
		if( $sub !== '' )
			$path = preg_replace('#[^/]*$#', '', $path).$sub;
	}

	return cleanurl($root._remove_dot_segments($path).$tail);
}

/**
 * Recursively crawls $url and every same-site link found in it, collecting
 * the visited URLs in the global $urls list.
 *
 * Stops silently once CRAWL_TIME_OUT seconds have passed since
 * $GLOBALS['start_time'] or memory use exceeds MEMORY_LIMIT.
 *
 * @param string $url URL to fetch and scan
 * @return false|null false when the URL could not be retrieved, otherwise nothing
 */
function crawl($url){
	if( round(_time()-$GLOBALS['start_time']) > CRAWL_TIME_OUT  || _mem() > MEMORY_LIMIT ){
		return;
	}
	global $urls;
	#the list may not have been initialised by the caller yet
	if( !is_array($urls) )
		$urls = array();

	if( !($cont = &get_url($url)) )
		return false;
	if( isset($cont['new_url']) ){
		#redirected: record the final URL, unless it has been crawled already
		if(in_array( $cont['new_url'] , $urls) ) return;
		$url = $cont['new_url'];
	}

	$urls[] = $url;

	$html = isset($cont['content']) ? $cont['content'] : '';

	$matches = array();
	preg_match_all('#\s+(?:href|src|URL|action)\s*=\s*["\']?((?!mailto:|news:|javascript:|ftp:|telnet:|callto:|&quot;|ed2k:)[^"\'\#\s>]+)#is', $html, $matches);

	#a <base href> applies to the whole document, so look it up once;
	#href may be the first attribute of the tag or follow others
	$base = array();
	preg_match('#<base\s+(?:[^>]*\s)?href\s*=\s*["\']?(https?://[^"\'\#\s>]*)#i', $html, $base);
	$base_url = empty($base[1]) ? $url : $base[1];

	$same_site = '#^https?://[^/]*'.preg_quote(SITE, '#').'/[^\#]*#i';

	foreach( $matches[1] as $sub ){
		if(preg_match('#\.(ico|png|jpg|gif|css|js)(\?[^\?/]*)?$#i', $sub) )	#excluding graphics
			continue;

		$curl = make_url($base_url, $sub);
		if( !preg_match($same_site, $curl) )
			continue;

		if( !in_array($curl ,$urls )  ){
			crawl($curl);
		}
	}
}

/**
 * Debug helper: prints a value, HTML-escaped, inside <pre> between two rules.
 *
 * Arrays and objects are shown via var_export(); a string naming an existing
 * file is replaced by that file's contents; empty values are reported as
 * "nothing to echo".
 *
 * @param mixed $val value to display
 */
function _echo($val){
	echo "<hr /><pre >";

	if( is_array($val) || is_object($val) ){
		$text = var_export($val,1);
	}elseif( is_file($val) ){
		$text = file_get_contents($val);
	}elseif( empty($val) ){
		$text = "nothing to echo";
	}else{
		$text = $val;
	}

	echo htmlspecialchars($text);
	echo "</pre><hr />";
}
/**
 * Prints a message as an HTML <div>, prefixed by a bold title.
 *
 * Neither argument is escaped, so both may contain markup.
 *
 * @param string $msg   message text (HTML allowed)
 * @param string $title label shown before the message
 */
function _error($msg, $title='NOTICE'){
	$label = '<strong>'.$title.': </strong>';
	echo '<div>'.$label.$msg.'</div>';
}
/**
 * Current Unix time with microsecond resolution.
 *
 * @return float seconds since the epoch, including the fractional part
 */
function _time()
{
   // microtime() yields "<fraction> <seconds>"; add the two halves together
   $pieces = explode(" ", microtime());
   $fraction = (float)$pieces[0];
   $seconds = (float)$pieces[1];
   return ($fraction + $seconds);
}
/**
 * Memory used by the script, in megabytes.
 *
 * Prefers the peak figure and falls back to current usage; both are queried
 * with the "real usage" flag set.
 *
 * @return float|false megabytes rounded to two decimals, or false when PHP
 *                     offers neither memory function
 */
function _mem() {
	foreach( array("memory_get_peak_usage", "memory_get_usage") as $probe ){
		if( function_exists($probe) ){
			$bytes = $probe(true);
			return round($bytes / 1024 / 1024,2);
		}
	}
	return false;
}
?>