取HTTP网页源码

// Download the raw page source; the URL literal below is a placeholder to be replaced.
$ret = file_get_contents('要采集的网页URL');

// Turn the two boundary markers into pattern fragments and capture whatever lies between them.
// NOTE(review): change_match_string() is not defined in this snippet; presumably it escapes the
// markers for use in a regex. No pattern delimiters are added here, so confirm the helper
// supplies them, otherwise preg_match() will reject the pattern.
$begin = change_match_string('匹配开头的字符串');
$end = change_match_string('匹配结尾的字符串');
$p = $begin . '(.*)' . $end;

// $rs[0] holds the whole match and $rs[1] the first parenthesised group.
// Yield the captured text on a match, false otherwise.
return preg_match($p, $ret, $rs) ? $rs[1] : false;

乱码编码转换(网址表单):

// Re-encode $contents from GB2312 to UTF-8 and keep the result in $getcontent.
$getcontent = iconv('gb2312', 'utf-8', $contents);

网页内容乱码时的编码转换(忽略无法转换的字符):

// Re-encode $html in place from GB2312 to UTF-8; the //IGNORE suffix asks iconv
// to drop characters that cannot be represented in the target charset.
$html = iconv(
    'gb2312',
    'utf-8//IGNORE',
    $html
);

取HTTPS网页源码

若用 file_get_contents() 函数会报错:

file_get_contents(): Unable to find the wrapper "https" - did you forget to enable it when you configured PHP?

所以改用 cURL 扩展(curl_init()、curl_setopt()、curl_exec())来抓取:

<?php 
// Declare the response as UTF-8 HTML so the browser decodes the echoed text correctly.
header('Content-Type: text/html;charset=utf-8');

/**
 * Fetch a URL with cURL and return the response body escaped for display as HTML source.
 *
 * @param string $url        Address to request.
 * @param bool   $verifyPeer When true, the server certificate and its host name are verified.
 *                           Defaults to false to preserve the historical behaviour of this helper.
 *                           NOTE(security): with verification off the transfer can be intercepted
 *                           or altered in transit; pass true for anything beyond a local demo.
 *
 * @return string The body run through htmlspecialchars(). An empty string is returned when the
 *                transfer fails, and an E_USER_WARNING carrying the cURL error is raised.
 */
function getHttps($url, $verifyPeer = false){
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('getHttps: curl_init() failed', E_USER_WARNING);
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the body from curl_exec() instead of printing it, and leave response headers out.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    // Certificate chain check; host-name check uses 2 (verify) or 0 (skip).
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool) $verifyPeer);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verifyPeer ? 2 : 0);

    $output = curl_exec($ch);
    // Read the error text before the handle is released.
    $error = ($output === false) ? curl_error($ch) : '';

    // curl_close() has no effect since PHP 8.0; it is only needed to free the resource on older versions.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    if ($output === false) {
        // Surface the failure instead of silently escaping `false` into an empty string.
        trigger_error("getHttps: request for {$url} failed: {$error}", E_USER_WARNING);
        return '';
    }

    // Explicit flags and charset keep the output identical across PHP versions. ENT_SUBSTITUTE
    // stops a page that is not valid UTF-8 from collapsing into an empty string.
    return htmlspecialchars($output, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

// Demo: request a sample HTTPS page and print whatever getHttps() returns for it.
$url = 'https://www.hao123.com/index.html';
echo getHttps($url);
?>