воскресенье, 13 мая 2018 г.

Parsing all links of a website into a CSV file with PHP

// NOTE(review): the opening of the original listing (PHP open tag, class header,
// property declarations and constructor signature) was lost. It is reconstructed
// below from how the rest of the code uses it - confirm against the original
// script. The listing must follow a PHP opening tag to be executed.

/**
 * Crawls a web site starting from a root URL, collects the links found in
 * <a href> attributes that point to the same host as the root URL, prints
 * them and writes them to "allLinksFromYourSite.csv".
 */
class ParseLinks
{
    /**
     * Deepest page level that is still downloaded. The root page is level 1;
     * links found on a level-4 page are collected but not followed.
     */
    const MAX_DEPTH = 4;

    /** @var string[] Collected links, in discovery order. */
    private $linkArray = [];

    /** @var array Collected links as array keys, for constant-time duplicate checks. */
    private $seenLinks = [];

    /** @var string URL the crawl starts from. */
    private $sRootLink;

    /** @var int Maximum number of links to collect. */
    private $iCountOfPages;

    /** @var int Level of the page currently being parsed (0 = crawl not started). */
    private $iDeep;

    /** @var string Host of the root URL; only links to this host are kept. */
    private $sDomain;

    /** @var string Scheme of the root URL, used to complete relative links. */
    private $sScheme;

    /**
     * @param string $sRootLink     URL the crawl starts from.
     * @param int    $iCountOfPages Maximum number of links to collect.
     */
    public function __construct($sRootLink, $iCountOfPages)
    {
        $this->sRootLink = $sRootLink;
        $this->iCountOfPages = $iCountOfPages;
        $this->iDeep = 0;
        $this->sDomain = "";
        $this->sScheme = "";
    }

    /**
     * Runs the crawl from the root URL, then prints the collected links and
     * saves them to the CSV file.
     *
     * @throws RuntimeException If the CSV file cannot be opened for writing.
     */
    public function getAllLinks()
    {
        $this->recParseLinks($this->sRootLink);
        $this->printLinks();
        $this->saveToCSV();
    }

    /**
     * Echoes the site host, the number of collected links and then every
     * link on its own line.
     */
    private function printLinks()
    {
        echo "Web-site: www." . $this->sDomain . "\nCount of links: " . count($this->linkArray) . "\n\n";

        foreach ($this->linkArray as $element) {
            echo $element . "\n";
        }
    }

    /**
     * Writes the site host, the number of collected links and the links
     * themselves (one per line) to "allLinksFromYourSite.csv".
     *
     * @throws RuntimeException If the file cannot be opened for writing.
     */
    private function saveToCSV()
    {
        $fp = fopen("allLinksFromYourSite.csv", "w");
        if ($fp === false) {
            // Writing to a failed handle would only produce warnings or a TypeError.
            throw new RuntimeException('Cannot open allLinksFromYourSite.csv for writing');
        }

        fwrite($fp, "Web-site: $this->sDomain" . PHP_EOL);
        fwrite($fp, "Count of links: " . count($this->linkArray) . PHP_EOL . PHP_EOL);

        foreach ($this->linkArray as $element) {
            fwrite($fp, $element . PHP_EOL);
        }

        fclose($fp);
    }

    /**
     * Downloads one page and collects its same-host links, following each
     * newly found link recursively while the depth limit allows it.
     *
     * @param string $link Absolute URL of the page to parse.
     */
    private function recParseLinks($link)
    {
        if (strlen($link) <= 1) {
            return;
        }

        // The very first call defines which host and scheme the crawl is bound to.
        if ($this->iDeep == 0 && !$this->initRootUrl($link)) {
            return;
        }

        $this->iDeep++;
        try {
            $this->collectLinksFromPage($link);
        } finally {
            // Restore the level on every exit path, including the early
            // return taken when the link limit is reached.
            $this->iDeep--;
        }
    }

    /**
     * Extracts host and scheme from the root URL.
     *
     * @param string $link Root URL.
     *
     * @return bool False when the URL has no usable host or scheme.
     */
    private function initRootUrl($link)
    {
        $parts = parse_url($link);
        if ($parts === false || !isset($parts['host'], $parts['scheme'])) {
            return false;
        }

        $this->sDomain = $parts['host'];
        $this->sScheme = $parts['scheme'];

        return true;
    }

    /**
     * Fetches a page, stores every new same-host link it contains and
     * recurses into those links while below MAX_DEPTH.
     *
     * @param string $pageUrl Absolute URL of the page to fetch.
     */
    private function collectLinksFromPage($pageUrl)
    {
        $html = file_get_contents($pageUrl);
        if ($html === false || $html === '') {
            // Unreachable or empty page: nothing to parse.
            return;
        }

        // Real-world HTML is rarely valid; keep the parser's complaints out of the output.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            return;
        }

        foreach ($doc->getElementsByTagName('a') as $anchor) {
            if (count($this->linkArray) >= $this->iCountOfPages) {
                return;
            }

            $url = $this->resolveHref($anchor->getAttribute('href'));
            if ($url === null || $this->linkExists($url)) {
                continue;
            }

            $this->linkArray[] = $url;
            $this->seenLinks[$url] = true;

            if ($this->iDeep < self::MAX_DEPTH) {
                $this->recParseLinks($url);
            }
        }
    }

    /**
     * Turns an href value into an absolute URL on the crawled host.
     *
     * @param string $href Raw value of the href attribute.
     *
     * @return string|null The absolute URL, or null when the href is empty,
     *                     unparsable, has no host or points to another host.
     */
    private function resolveHref($href)
    {
        $href = trim($href);
        if ($href === '') {
            return null;
        }

        if (strncmp($href, '//', 2) === 0) {
            // Protocol-relative link: it already names its host, only the scheme is missing.
            $href = $this->sScheme . ':' . $href;
        } elseif ($href[0] === '/' || $href[0] === '?') {
            $href = $this->sScheme . "://" . $this->sDomain . $href;
        }

        $parts = parse_url($href);
        if ($parts === false || !isset($parts['host']) || $parts['host'] != $this->sDomain) {
            return null;
        }

        return $href;
    }

    /**
     * Tells whether a link has already been collected.
     *
     * @param string $link Absolute URL.
     *
     * @return bool
     */
    private function linkExists($link)
    {
        return isset($this->seenLinks[$link]);
    }
}

$parseLinksObject = new ParseLinks('https://web.archive.org/web/20130702165710/http://www.simps.ru/', 3000);
$parseLinksObject->getAllLinks();

wget start

wget -i *.txt