抓取網頁資料實作 @ King的幸福國度

測試網址為誠品網路書局…<a href="http://www.eslite.com/search_pro.aspx?query=9789868450639">http://www.eslite.com/search_pro.aspx?query=9789868450639</a>
 
<?php
/*
 * Fetches an eslite.com search-result page for one keyword with cURL and
 * pulls three fragments out of the returned HTML with regular expressions:
 *   step 1 - the first <img> tag inside the div with class "box_mid_none_1"
 *   step 2 - the first <a>...</a> inside the h3 with class "tn15"
 *   step 3 - the first td whose style attribute is "color:#000"
 * Each fragment is echoed as raw HTML. When a fragment cannot be found the
 * label is still printed, followed by an empty value.
 *
 * NOTE(review): every pattern below requires a literal space right before the
 * closing tag (e.g. " </div>"); markup that closes without that space will
 * not match. The patterns are kept as they were - confirm this is intended.
 */

/*---------- Fetch the page ----------*/
// Search keyword (an ISBN).
$string = "9789868450639";
// urlencode() keeps the query string well-formed should the keyword ever
// contain spaces, '&' or non-ASCII characters; a digits-only ISBN is unchanged.
$url = "http://www.eslite.com/search_pro.aspx?query=" . urlencode($string);
$ch = curl_init();
$timeout = 5; // connection timeout, in seconds
curl_setopt($ch, CURLOPT_URL, $url);
// Make curl_exec() return the response body instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
// For pages that require HTTP authentication, add the following two lines:
//curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
//curl_setopt($ch, CURLOPT_USERPWD, US_NAME.":".US_PWD);
$contents = curl_exec($ch);
if ($contents === false) {
    // curl_exec() returns false on failure. Report why (the error must be read
    // before the handle is closed) and continue with an empty document so the
    // extraction steps below simply find nothing.
    echo "cURL error: " . curl_error($ch);
    $contents = '';
}
curl_close($ch);
//echo $contents;

/*---------- Step 1: cover image ----------*/
// Work on a copy of the fetched HTML.
$text = $contents;
// Optional: strip line breaks and tabs first (only needed for serialized content).
//$text=str_replace(array("\r","\n","\t","\s"), '', $text);

// Find the first div whose class attribute is "box_mid_none_1"; $match[0] is
// the whole element text that matched, $match[1] the captured inner content.
$match = array();
$img = array();
if (preg_match('/<div[^>]*class="box_mid_none_1"[^>]*>(.*?) <\/div>/si', $text, $match)) {
    // Only look for an <img> tag when the container div was actually found;
    // otherwise $match[0] does not exist.
    preg_match('/<img[^>]*>/Ui', $match[0], $img);
}
// Print the <img> tag itself, or nothing after the label when none was found.
echo "書的封面是這個：" . (isset($img[0]) ? $img[0] : '');

/*---------- Step 2: book title link ----------*/
// Optional: strip line breaks and tabs first (only needed for serialized content).
//$text=str_replace(array("\r","\n","\t","\s"), '', $text);

// Find the first h3 whose class attribute is "tn15".
$book_name = array();
$book_a = array();
if (preg_match('/<h3[^>]*class="tn15"[^>]*>(.*?) <\/h3>/si', $text, $book_name)) {
    //echo "書的名字是這個：".$book_name[0]."";
    // Inside that h3, take the first anchor element. The original pattern
    // repeated the [^>]* class twice; a single occurrence matches the same text.
    preg_match('/<a[^>]*>(.*?) <\/a>/si', $book_name[0], $book_a);
}

// Print the whole anchor element, or nothing after the label when none was found.
echo "連結的內容是這個：" . (isset($book_a[0]) ? $book_a[0] : '');
echo " ";

/*---------- Step 3: first td styled color:#000 ----------*/
// preg_match() leaves $td as an empty array when nothing matches, so print_r()
// is safe either way.
$td = array();
preg_match('/<td[^>]*style="color:#000"[^>]*>(.*?) <\/td>/si', $text, $td);

print_r($td);

?>