sRootLink = $sRootLink;
$this->iCountOfPages = $iCountOfPages;
$this->iDeep = 0;
$this->sDomain = "";
$this->sScheme = "";
}
/**
 * Runs the whole job: crawls from the root link, echoes the collected
 * links, and then writes them to the output file.
 *
 * @return void
 */
public function getAllLinks()
{
    // Step 1: gather links, starting at the configured root link.
    $this->recParseLinks($this->sRootLink);

    // Step 2: report what was gathered, on screen and on disk.
    $this->printLinks();
    $this->saveToCSV();
}
/**
 * Echoes a short report: the crawled host, the number of collected links,
 * and then every collected link on its own line.
 *
 * The header mirrors the one written by saveToCSV(): the host is printed
 * exactly as stored in $this->sDomain (no extra "www." prefix, which would
 * produce e.g. "www.www.example.com"), and each header field ends with a
 * line break so the fields and the first link no longer run together.
 *
 * @return void
 */
private function printLinks()
{
    echo "Web-site: " . $this->sDomain . PHP_EOL;
    echo "Count of links: " . count($this->linkArray) . PHP_EOL . PHP_EOL;

    foreach ($this->linkArray as $element) {
        echo $element . PHP_EOL;
    }
}
/**
 * Writes the report to "allLinksFromYourSite.csv" (a relative path):
 * a "Web-site" line, a "Count of links" line, a blank line, and then one
 * collected link per line. An existing file is truncated.
 *
 * @return void
 * @throws \RuntimeException If the output file cannot be opened for writing.
 */
private function saveToCSV()
{
    $path = "allLinksFromYourSite.csv";

    $fp = fopen($path, "w");
    if ($fp === false) {
        // fopen() returns false on failure; passing that to fwrite()/fclose()
        // would fail with an unrelated, harder-to-diagnose error.
        throw new \RuntimeException("Unable to open " . $path . " for writing");
    }

    fwrite($fp, "Web-site: $this->sDomain" . PHP_EOL);
    fwrite($fp, "Count of links: " . count($this->linkArray) . PHP_EOL . PHP_EOL);
    foreach ($this->linkArray as $element) {
        fwrite($fp, $element . PHP_EOL);
    }

    fclose($fp);
}
/**
 * Recursively crawls a page and collects the links that stay on the root
 * link's host.
 *
 * On the outermost call (depth 0) the host and scheme of $link are stored in
 * $this->sDomain and $this->sScheme; every later link is filtered against
 * that host. Each new same-host link is appended to $this->linkArray and,
 * while the current depth is below 4, crawled in turn. Collection stops as
 * soon as $this->iCountOfPages links have been gathered.
 *
 * The depth counter $this->iDeep is always restored before returning, so the
 * object is left at depth 0 when the outermost call finishes.
 *
 * @param string $link URL of the page to fetch and scan for anchors.
 * @return void
 */
private function recParseLinks($link)
{
    if (strlen($link) <= 1) {
        return;
    }

    if ($this->iDeep == 0) {
        $rootParts = parse_url($link);
        // parse_url() returns false for seriously malformed URLs and simply
        // omits components that are absent, so both keys must be checked.
        if (!is_array($rootParts) || !isset($rootParts['host'], $rootParts['scheme'])) {
            return;
        }
        $this->sDomain = $rootParts['host'];
        $this->sScheme = $rootParts['scheme'];
    }

    $this->iDeep++;

    $html = file_get_contents($link);
    // A failed fetch (false) or an empty body cannot be parsed; skip the page
    // but still fall through to the depth decrement below.
    if ($html !== false && $html !== '') {
        $doc = new DOMDocument();

        // Real-world HTML is rarely valid; collect libxml's complaints
        // internally instead of emitting a warning per markup error, then
        // restore whatever setting the caller had.
        $previousSetting = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        foreach ($doc->getElementsByTagName('a') as $anchor) {
            // Leave the loop (not the method) so the depth counter is
            // decremented even when the link limit is reached.
            if (count($this->linkArray) >= $this->iCountOfPages) {
                break;
            }

            $href = $anchor->getAttribute('href');
            if ($href === '') {
                // Anchor without a usable href; nothing to resolve.
                continue;
            }

            if (substr($href, 0, 2) === '//') {
                // Protocol-relative URL: it already carries its own host and
                // only needs the scheme.
                $href = $this->sScheme . ':' . $href;
            } elseif ($href[0] == '/' || $href[0] == '?') {
                // Root-relative path or bare query string: prefix scheme and host.
                $href = $this->sScheme . '://' . $this->sDomain . $href;
            }

            $parts = parse_url($href);
            // Skip unparsable links, links without a host (relative paths,
            // fragments, mailto:, ...) and links pointing to another host.
            if (!is_array($parts) || !isset($parts['host']) || $parts['host'] !== $this->sDomain) {
                continue;
            }

            if (!$this->linkExists($href)) {
                $this->linkArray[] = $href;
                if ($this->iDeep < 4) {
                    $this->recParseLinks($href);
                }
            }
        }
    }

    $this->iDeep--;
}
/**
 * Tells whether the given link is already present in $this->linkArray.
 *
 * @param string $link Link to look for.
 * @return bool True when an equal entry has been collected, false otherwise.
 */
private function linkExists($link)
{
    // Non-strict in_array() compares with ==, like a manual equality scan.
    return in_array($link, $this->linkArray);
}
}
// Script entry point: crawl an archived snapshot of www.simps.ru.
// The second argument (3000) is presumably the maximum number of links to
// collect ($iCountOfPages) - confirm against the constructor signature.
$parseLinksObject = new ParseLinks('https://web.archive.org/web/20130702165710/http://www.simps.ru/', 3000);
// Crawl, echo the collected links, and write them to allLinksFromYourSite.csv.
$parseLinksObject->getAllLinks();
Комментариев нет:
Отправить комментарий