<?php
// Stop immediately unless BASEPATH is defined; the exit message indicates this
// guards against the script being requested directly.
if( ! defined('BASEPATH'))
{
	exit('No direct script access allowed');
}

/**
 * Maintenance controller with two actions:
 *
 *  - linkcheck(): crawls LIVE_SITE and saves a Crawler_Issue for every broken link,
 *    redirected link and broken image it finds.
 *  - sitemap():   writes ABSOLUTE_PATH/sitemap.xml from the site's pages and shop routes.
 *
 * Both actions run either from the command line or, when $manual_action is truthy,
 * for a user that passes admin_login_required() and the ACTION_SEO check.
 */
class Crawler extends CI_Controller
{
	// Currently not referenced by any method of this class.
	private $domains = array();

	// Base URL of the site being processed; assigned from LIVE_SITE by the public actions.
	private $mydomain = "";

	// Set of URLs that were already fetched.  URLs are stored as array KEYS so that
	// membership can be tested with isset() instead of scanning a list.
	private $crawled = array();

	public function __construct()
	{
		parent::__construct();

		// When FORCE_SSL is enabled, permanently redirect plain-HTTP web requests to HTTPS.
		// Command line runs are never redirected.
		if(defined("FORCE_SSL") && FORCE_SSL && preg_match('/^http:/', current_url()) && !is_cli())
		{
			if(defined("LOGGING")) { error_log(PRODUCT_NAME . ": Redirecting to HTTPS."); }
			redirect(str_replace("http://", "https://", current_url()), 'auto', 301);
		}

		$this->current_user = get_user();
		$this->data["user"] = $this->current_user;
	}

	/**
	 * Crawls the live site and records broken links, redirects and broken images.
	 *
	 * @param mixed $manual_action Truthy when started by a user: the user must be logged in and
	 *                             pass the ACTION_SEO check, and is redirected to the issue list
	 *                             afterwards.  Otherwise the request must come from the CLI.
	 */
	public function linkcheck($manual_action=false)
	{
		if($manual_action)
		{
			admin_login_required();

			if(!$this->current_user->has(ACTION_SEO))
			{
				return $this->denied($this->data);
			}
		}
		else if(php_sapi_name() !== "cli")
		{
			die("This function can only be run from the command line or as an authenticated user.");
		}

		$this->load->model("crawler_issue");
		$this->crawler_issue->reset();
		$this->mydomain = LIVE_SITE;
		$this->crawled = array();
		$this->crawl_links(LIVE_SITE);

		if($manual_action)
		{
			redirect("/admin/crawler_issues");
		}
	}

	/**
	 * Builds sitemap.xml (including image entries) and writes it to ABSOLUTE_PATH.
	 *
	 * The home page comes first, followed by every published page except "index", "home"
	 * and "search", followed by every shop route.  All values are XML-escaped because the
	 * sitemap protocol requires entity-escaped URLs.
	 *
	 * @param mixed $manual_action Same meaning as in linkcheck(); a manual run ends with a
	 *                             redirect to /admin/seo/1.
	 */
	public function sitemap($manual_action=false)
	{
		if($manual_action)
		{
			admin_login_required();

			if(!$this->current_user->has(ACTION_SEO))
			{
				return $this->denied($this->data);
			}
		}
		else if(php_sapi_name() !== "cli")
		{
			die("This function can only be run from the command line or as an authenticated user.");
		}

		$this->load->model(array("page", "content_value", "shop/shop_route", "shop/shop_product"));
		$this->mydomain = LIVE_SITE;

		$today = date("Y-m-d");

		$content = '<?xml version="1.0" encoding="UTF-8"?>' . "\n" .
		           '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">' . "\n";

		Mainframe::init();

		$site = Mainframe::site();
		Mainframe::LoadPage($site->site_id, $site->default_page_id);
		$main_page = Mainframe::content(Mainframe::page(), false, true);

		// Home page entry, with the images found in its rendered content.
		$content .= "\t<url>\n" .
		            "\t\t<loc>" . $this->escape_markup(LIVE_SITE) . "</loc>\n" .
		            "\t\t<changefreq>monthly</changefreq>\n" .
		            "\t\t<priority>1.00</priority>\n" .
		            "\t\t<lastmod>" . $today . "</lastmod>\n" .
		            $this->sitemap_image_tags($main_page) .
		            "\t</url>\n";

		$pages 	= $this->page->loadBySiteID($site->site_id, null);
		$routes = $this->shop_route->loadAllBySiteID($site->site_id);

		foreach($pages as $p)
		{
			if(!($p->published && $p->url != "index" && $p->url != "home" && $p->url != "search"))
			{
				continue;
			}

			$content .= "\t<url>\n" .
			            "\t\t<loc>" . $this->escape_markup(LIVE_SITE . "/" . $p->url) . "</loc>\n";

			if($p->sitemap_changefreq != "")
			{
				$content .= "\t\t<changefreq>" . $this->escape_markup($p->sitemap_changefreq) . "</changefreq>\n";
			}

			if($p->sitemap_priority != "")
			{
				$content .= "\t\t<priority>" . $this->escape_markup($p->sitemap_priority) . "</priority>\n";
			}

			$content .= "\t\t<lastmod>" . ($p->modified ? date("Y-m-d", strtotime($p->modified)) : $today) . "</lastmod>\n";

			// The lookup may not yield an object for every page; treat that as "no content".
			$value = $this->content_value->LoadByPageKey($p->page_id, "content1");
			$content .= $this->sitemap_image_tags(is_object($value) ? $value->value : "");

			$content .= "\t</url>\n";
		}

		foreach($routes as $r)
		{
			$mod = $today;

			if($r->product_id)
			{
				$p = new Shop_product();
				$p->load($r->product_id);

				if($p->last_modified)
				{
					$mod = date("Y-m-d", strtotime($p->last_modified));
				}
			}

			$content .= "\t<url>\n" .
			            "\t\t<loc>" . $this->escape_markup(LIVE_SITE . "/" . $r->url) . "</loc>\n" .
			            "\t\t<lastmod>" . $mod . "</lastmod>\n" .
			            "\t</url>\n";
		}

		$content .= '</urlset>';

		if(file_put_contents(ABSOLUTE_PATH . "/sitemap.xml", $content) === false)
		{
			error_log("Crawler: unable to write " . ABSOLUTE_PATH . "/sitemap.xml");
		}

		if($manual_action)
		{
			redirect("/admin/seo/1");
		}
	}

	/**
	 * Recursively follows the anchors of $url and returns sitemap <url> entries for every
	 * internal page that was not visited before.
	 *
	 * No method of this class calls this at present.
	 *
	 * @param string $url Absolute URL of the page to fetch.
	 * @return string Concatenated <url> elements; "" when the page cannot be fetched.
	 */
	private function sitemap_crawl($url)
	{
		$this->crawled[$url] = true;

		$page = $this->fetch($url);

		if($page === false)
		{
			return "";
		}

		$content = "";

		foreach($this->attribute_values($page, 'href', true) as $link)
		{
			if(trim($link) == "/" || is_file(urldecode($link)) || preg_match('/(mailto|tel|javascript):/i', $link))
			{
				continue;
			}

			// Don't add files to the site map.
			if(preg_match('/\.(jpg|jpeg|png|gif|doc|docx|xls|xlsx|pdf)$/i', $link) > 0)
			{
				continue;
			}

			list($target, $internal) = $this->resolve_link($link);

			// External pages do not belong in our sitemap.
			if(!$internal)
			{
				continue;
			}

			$target = $this->normalise_url($target);

			// A quote inside the URL means the HTML was malformed; skip only this link
			// so the entries collected so far are kept.
			if(!$target || preg_match('/(\'|")/', $target) || isset($this->crawled[$target]))
			{
				continue;
			}

			// Deeper paths get a lower priority.  The divisor is clamped to 1 so that a
			// URL with fewer than two slashes cannot cause a division by zero.
			$depth = max(1, substr_count($target, "/") - 1);
			$priority = number_format(1 / $depth, 2);
			// TODO: find a way to calculate how often files are changing.
			$changefreq = "daily";

			$content .= "\t<url>\n\t\t<loc>" . $this->escape_markup($target) . "</loc>\n\t\t<changefreq>$changefreq</changefreq>\n\t\t<priority>$priority</priority>\n\t\t<lastmod>" . date("Y-m-d") . "</lastmod>\n\t</url>\n";
			$content .= $this->sitemap_crawl($target);
		}

		return $content;
	}

	/**
	 * Fetches $url, checks every href and every internal src found in it, saves one
	 * Crawler_Issue per problem and recurses into internal links that check out.
	 *
	 * @param string $url Absolute URL of the page to inspect.
	 * @return string|false The fetched body, or false when the request failed.
	 */
	private function crawl_links($url)
	{
		$url = str_replace(" ", "%20", $url);
		$this->crawled[$url] = true;

		$content = $this->fetch($url);

		$bad_links 			= array();
		$redirect_links 	= array();
		$bad_images 		= array();
		$domains_to_ignore 	= array('facebook\.com', 'ca\.linkedin\.com', 'fonts\.google\.com', 'plus\.google\.com');

		foreach($this->attribute_values($content, 'href') as $link)
		{
			set_time_limit(30);

			if(trim($link) == "/" || is_file(urldecode($link)) || preg_match('/(mailto|tel|javascript):/i', $link))
			{
				continue;
			}

			list($url_check, $internal) = $this->resolve_link($link);

			// Encode spaces BEFORE looking the URL up: visited URLs are stored encoded, so
			// comparing the raw form would never match and the page would be crawled again.
			$url_check = str_replace(" ", "%20", $this->normalise_url($url_check));

			// If the URL has any quotes in it, something has gone wrong (probably bad HTML).
			if(!$url_check || preg_match('/(\'|")/', $url_check) || isset($this->crawled[$url_check]))
			{
				continue;
			}

			// Links to certain domains are ignored because they return false positives.
			$skip = false;

			foreach($domains_to_ignore as $d)
			{
				if(preg_match('/' . $d . '/i', $url_check))
				{
					$skip = true;
					break;
				}
			}

			if($skip)
			{
				continue;
			}

			// Reset for every link so data from the previous check can never leak through.
			$url_info = null;

			if(!check_url($url_check, $url_info))
			{
				$bad_links[] = $url_check;
				continue;
			}

			// Valid, but the request may have been redirected.
			if(isset($url_info["redirect_time"], $url_info["url"]) && $url_info["redirect_time"] > 0)
			{
				// With a redirect, "url" holds the final destination.
				$destination = $url_info["url"];

				if(substr($destination, -1) == "/")
				{
					$destination = substr($destination, 0, -1);
				}

				$redirect_links[$url_check] = $destination;
			}

			// Only pages of our own site are crawled further.
			if($internal)
			{
				$this->crawl_links($url_check);
			}
		}

		foreach($this->attribute_values($content, 'src') as $link)
		{
			set_time_limit(30);

			// Inline data: URIs have nothing to request.
			if(trim($link) == "/" || is_file(urldecode($link)) || preg_match('/^data:/i', $link))
			{
				continue;
			}

			list($url_check, $internal) = $this->resolve_link($link);

			// Sources hosted elsewhere are not checked.
			if(!$internal)
			{
				continue;
			}

			$url_check = str_replace(" ", "%20", $url_check);

			if(isset($this->crawled[$url_check]))
			{
				continue;
			}

			if(!check_img($url_check))
			{
				$bad_images[] = $url_check;
			}
		}

		// Every issue is filed against the page that was crawled ($url).  The loop
		// variables below deliberately use other names so $url is never overwritten.
		foreach($bad_links as $link)
		{
			$href = $this->escape_markup($link);
			$this->record_issue($url, "<p>Broken link or other HREF detected: <a href='$href' target='_blank'>$href</a></p><p>You should check this link immediately.</p>", 2);
		}

		foreach($redirect_links as $source => $redirect)
		{
			$from = $this->escape_markup($source);
			$to = $this->escape_markup($redirect);
			$this->record_issue($url, "<p>Redirected link or other HREF detected: <a href='$from' target='_blank'>$from</a> redirects to <a href='$to' target='_blank'>$to</a></p><p>Sometimes this can be caused by broken links.  If the link works, you can ignore this warning.</p>", 1);
		}

		foreach($bad_images as $image)
		{
			$src = $this->escape_markup($image);
			$this->record_issue($url, "<p>Broken image or other SRC detected: <a href='$src' target='_blank'>$src</a></p>", 2);
		}

		return $content;
	}

	/**
	 * Downloads $url with cURL and returns the body, or false on failure.
	 */
	private function fetch($url)
	{
		$ch = curl_init($url);

		if($ch === false)
		{
			return false;
		}

		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		// NOTE(review): certificate verification is switched off here, as it always was
		// in this crawler; enable it if every crawled host has a valid certificate.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
		$body = curl_exec($ch);
		curl_close($ch);

		return $body;
	}

	/**
	 * Returns the values of every quoted $attribute="..." / $attribute='...' in $html.
	 *
	 * The closing quote must be the same character as the opening one (back-reference \1).
	 *
	 * @param mixed  $html         Markup to scan; anything but a non-empty string yields array().
	 * @param string $attribute    Attribute name, e.g. "href" or "src".
	 * @param bool   $anchors_only When true only attributes inside <a ...> tags are returned.
	 * @return array List of raw attribute values.
	 */
	private function attribute_values($html, $attribute, $anchors_only = false)
	{
		if(!is_string($html) || $html === "")
		{
			return array();
		}

		$prefix = $anchors_only ? '<a\s[^>]*?' : '';

		if(!preg_match_all('/' . $prefix . $attribute . '=(["\'])(.*?)\1/i', $html, $matches))
		{
			return array();
		}

		return $matches[2];
	}

	/**
	 * Turns a link found in a page into an absolute URL and tells whether it is ours.
	 *
	 * @param string $link Raw attribute value.
	 * @return array array(string $absolute_url, bool $internal)
	 */
	private function resolve_link($link)
	{
		// Protocol-relative links ("//host/path") name another host; give them a scheme so
		// they are not mistaken for a path on our own site.
		if(strpos($link, "//") === 0)
		{
			$scheme = parse_url($this->mydomain, PHP_URL_SCHEME);
			$link = ($scheme ? $scheme : "http") . ":" . $link;
		}

		// The base URL may or may not carry a scheme and a "www." prefix; reduce it to the
		// bare host (plus path) before building the pattern.  The look-ahead prevents
		// "example.com.other.tld" from matching "example.com".
		$host = rtrim(preg_replace('/^https?:\/\/(?:www\.)?/i', '', $this->mydomain), "/");
		$own_site = '/^https?:\/\/(?:www\.)?' . preg_quote($host, '/') . '(?=[\/?#:]|$)/i';

		if(preg_match($own_site, $link))
		{
			// Absolute URL pointing to our site.
			return array($link, true);
		}

		if(!preg_match('/^https?:\/\//i', $link))
		{
			// Relative URL pointing to our site: join with exactly one slash.
			return array(rtrim($this->mydomain, "/") . "/" . ltrim($link, "/"), true);
		}

		// Absolute URL on another host.
		return array($link, false);
	}

	/**
	 * Removes the #fragment and then one trailing slash from $url.
	 */
	private function normalise_url($url)
	{
		$hash = strpos($url, "#");

		if($hash !== false)
		{
			$url = substr($url, 0, $hash);
		}

		if(is_string($url) && substr($url, -1) === "/")
		{
			$url = substr($url, 0, -1);
		}

		return $url;
	}

	/**
	 * Builds the <image:image> elements for every <img> in $html.
	 *
	 * Tags containing "module_placeholder", tags without a quoted src and data: URIs are
	 * skipped.  Relative sources are made absolute against the site's base URL.
	 *
	 * @param mixed $html Markup to scan; non-strings produce no output.
	 * @return string XML fragment (possibly empty).
	 */
	private function sitemap_image_tags($html)
	{
		$xml = "";

		if(!is_string($html) || !preg_match_all('/<img[^>]+>/i', $html, $tags))
		{
			return $xml;
		}

		foreach($tags[0] as $tag)
		{
			if(strpos($tag, 'module_placeholder') !== false)
			{
				continue;
			}

			if(!preg_match('/\ssrc=(["\'])(.*?)\1/i', $tag, $src) || $src[2] === "")
			{
				continue;
			}

			// The value comes out of HTML, so entities such as &amp; are decoded first and
			// the result is escaped exactly once for XML below.
			$location = html_entity_decode($src[2], ENT_QUOTES, 'UTF-8');

			if(preg_match('/^data:/i', $location))
			{
				continue;
			}

			list($location) = $this->resolve_link($location);

			$xml .= "\t\t<image:image>\n" .
			        "\t\t\t<image:loc>" . $this->escape_markup($location) . "</image:loc>\n" .
			        "\t\t</image:image>\n";
		}

		return $xml;
	}

	/**
	 * Escapes the five characters that are special in both XML and HTML.
	 *
	 * A plain replacement is used so the result does not depend on the input encoding.
	 */
	private function escape_markup($value)
	{
		return str_replace(
			array('&', '<', '>', '"', "'"),
			array('&amp;', '&lt;', '&gt;', '&quot;', '&#39;'),
			(string) $value
		);
	}

	/**
	 * Saves one Crawler_Issue stamped with the current time.
	 *
	 * @param string $page_url URL of the page on which the problem was found.
	 * @param string $notes    HTML description of the problem.
	 * @param int    $severity Severity value stored with the issue.
	 */
	private function record_issue($page_url, $notes, $severity)
	{
		$issue = new Crawler_Issue();
		$issue->crawltime = date("Y-m-d H:i:s");
		$issue->page_url = $page_url;
		$issue->notes = $notes;
		$issue->severity = $severity;
		$issue->save();
	}
}
