取HTTP网页源码
1 2 3 4 5 6 7 8 9 10 11 12
| $ret = file_get_contents('要采集的网页URL'); // Download the page source; the literal is a placeholder meaning "URL of the page to scrape". file_get_contents() returns false on failure, which is not checked here.
// Build the pattern as: start marker, one greedy capture group, end marker.
// The two literals are placeholders meaning "string matching the start" / "string matching the end".
// change_match_string() is not defined in this snippet - presumably it escapes regex metacharacters; verify against its definition.
// NOTE(review): no pattern delimiters or modifiers are visible in $p, yet preg_match() requires delimiters - confirm how they are supplied.
// NOTE(review): with no "s" modifier "." does not match newlines, so the capture cannot span several lines - confirm this is intended.
$begin=change_match_string('匹配开头的字符串'); $end=change_match_string('匹配结尾的字符串'); $p = "{$begin}(.*){$end}";
// Return the text captured between the two markers, or false when preg_match() reports no match (or an error).
// NOTE(review): the bare return statements suggest this fragment belongs inside a function whose header is not shown here.
if (preg_match($p,$ret,$rs)) return $rs[1]; else return false;
|
乱码编码转换(网址表单):
1
// Re-encode the GB2312 text held in $contents as UTF-8.
$getcontent = iconv('gb2312', 'utf-8', $contents);
|
网页乱码的编码转换:
1
// Convert the GB2312 page held in $html to UTF-8 in place.
// The //IGNORE suffix asks iconv to drop characters it cannot represent in the target charset.
$html = iconv('gb2312', 'utf-8//IGNORE', $html);
|
取HTTPS网页源码
若 PHP 未启用 openssl 扩展,用 file_get_contents() 读取 https 地址会报错:
file_get_contents(): Unable to find the wrapper "https" - did you forget to enable it when you configured PHP?
所以改用 cURL 扩展的函数(curl_init()、curl_setopt()、curl_exec())来获取:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
<?php
header("Content-Type: text/html;charset=utf-8");

/**
 * Download a page with cURL and return its source escaped for display inside HTML.
 *
 * @param string $url       Address of the page to fetch.
 * @param bool   $verifySsl Verify the server certificate and host name. Enabled by default;
 *                          pass false only for hosts you trust whose certificate cannot be
 *                          validated (e.g. no CA bundle is configured), because that leaves
 *                          the transfer open to man-in-the-middle attacks.
 *
 * @return string The HTML-escaped response body, or '' when the transfer fails
 *                (an E_USER_WARNING describing the cURL error is raised in that case).
 */
function getHttps($url, $verifySsl = true)
{
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('getHttps: unable to initialise cURL', E_USER_WARNING);
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // Hand the body back as a string instead of printing it, and leave the response headers out.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    // CURLOPT_SSL_VERIFYHOST takes 0 or 2, not a boolean.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $verifySsl ? true : false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verifySsl ? 2 : 0);

    $output = curl_exec($ch);
    // Read the error text before the handle is released.
    $error = ($output === false) ? curl_error($ch) : '';
    curl_close($ch);

    if ($output === false) {
        // Report the failure instead of silently turning false into an empty page.
        trigger_error("getHttps: request for {$url} failed: {$error}", E_USER_WARNING);
        return '';
    }

    // Explicit flags and charset: before PHP 8.1 the defaults return '' for any input
    // containing invalid UTF-8; ENT_SUBSTITUTE replaces such bytes instead.
    return htmlspecialchars($output, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

$url = 'https://www.hao123.com/index.html';
echo getHttps($url);
?>
|