Extracting all the links from a webpage is quite easy with PHP's DOM extension and cURL: fetch the page with cURL, then parse it with DOMDocument and read the href attribute of every <a> tag. Here is the script.
All links will be stored in the $urls array.
$url   = "http://google.com";
$agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13";

// Fetch the page with cURL
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);   // return the body instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);   // follow redirects
curl_setopt($ch, CURLOPT_MAXREDIRS, 2);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 20);
$html = curl_exec($ch);
curl_close($ch);

// Parse the HTML and collect every <a> tag's href attribute
libxml_use_internal_errors(true);                 // silence warnings from malformed real-world markup
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();

$urls  = array();
$hrefs = $dom->getElementsByTagName('a');
foreach ($hrefs as $href) {
    $urls[] = $href->getAttribute('href');
}

print_r($urls);
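If you need the same logic in more than one place, it can be wrapped in a reusable function. The sketch below is my own variation, not part of the original script: the function name extract_links is hypothetical, it uses curl_setopt_array and a DOMXPath query as an alternative to getElementsByTagName, skips <a> tags without an href, and removes duplicate URLs.

// A minimal sketch; extract_links is a hypothetical helper name
function extract_links($url, $agent = 'Mozilla/5.0 (compatible; LinkExtractor/1.0)')
{
    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 2,
        CURLOPT_USERAGENT      => $agent,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 20,
    ));
    $html = curl_exec($ch);
    curl_close($ch);

    if ($html === false) {
        return array();                       // request failed, nothing to parse
    }

    libxml_use_internal_errors(true);         // suppress warnings from malformed HTML
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    libxml_clear_errors();

    // XPath selects only <a> tags that actually carry an href attribute
    $xpath = new DOMXPath($dom);
    $urls  = array();
    foreach ($xpath->query('//a[@href]') as $a) {
        $urls[] = $a->getAttribute('href');
    }

    // Drop duplicates while keeping the original order
    return array_values(array_unique($urls));
}

print_r(extract_links('http://google.com'));

Note that the returned hrefs are taken as-is, so relative links stay relative; resolve them against the page URL yourself if you need absolute URLs.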