PHP抓取微信公众号文章实战教程
安装
composer require guzzlehttp/guzzle symfony/dom-crawler symfony/css-selector
使用
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Illuminate\Support\Facades\Storage;
use Symfony\Component\DomCrawler\Crawler;
$url = ''; // WeChat official-account article URL

// Build an HTTP client that looks like a regular desktop browser.
$client = new Client([
    'timeout' => 10,
    // Keep TLS certificate verification enabled so the fetched page cannot be
    // tampered with by a man-in-the-middle. If the local CA bundle is missing,
    // fix the bundle (or pass its path here) rather than disabling the check.
    'verify' => true,
    'headers' => [
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        // The article is an HTML document, so ask for HTML rather than JSON.
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer' => $url,
    ],
]);

try {
    $response = $client->get($url);
    $html = $response->getBody()->getContents();

    // Parse the HTML; the URL is passed as the base URI of the document.
    $crawler = new Crawler($html, $url);

    // Passing a default to text()/html() makes them return that default when
    // the selector matches nothing, instead of throwing an
    // InvalidArgumentException that the GuzzleException catch below would miss.

    // Title
    $title = $crawler->filter('#activity-name')->text('');

    // Author, publish time and location.
    // NOTE(review): some of these elements may be filled in by client-side
    // JavaScript and therefore be empty in the raw HTML — confirm on a real page.
    $author = $crawler->filter('#js_name')->text('');
    $datetime = $crawler->filter('#publish_time')->text('');
    $address = $crawler->filter('#js_ip_wording')->text('');

    // Keep the content node itself (not just its HTML string) so it can be
    // post-processed before being rendered.
    $contentNode = $crawler->filter('#js_content');
    $content = $contentNode->html('');

    return view('wechat', [
        'title' => $title,
        'author' => $author,
        'datetime' => $datetime,
        'address' => $address,
        'content' => $content,
    ]);
} catch (GuzzleException $exception) {
    // Record the failure and answer with a gateway error instead of dumping
    // the raw exception message to the client with dd().
    report($exception);
    abort(502, 'Failed to fetch the WeChat article.');
}
替换文章中的图片链接(以下代码需放在上文读取 #js_content 的 HTML 之前执行,这样输出的内容才会包含替换后的图片地址)
// Grab the content node (not its HTML string) so the <img> elements can be
// modified in place; read $contentNode->html() only after this loop.
$contentNode = $crawler->filter('#js_content');

// Compute the dated directory once so the stored path and the public URL
// always agree, even if the loop runs across midnight.
$imageDirectory = 'images/wechat/' . date('Ymd');

// Download every lazily-loaded image, store it on the public disk and point
// the element's src attribute at the local copy.
$contentNode->filter('img')->each(static function (Crawler $imgNode) use ($imageDirectory): void {
    $img = $imgNode->getNode(0);
    $dataSrc = $img->getAttribute('data-src');

    // The attribute comes from untrusted remote HTML: only fetch absolute
    // http(s) URLs so that local paths or stream wrappers (file://, php://, ...)
    // can never be read and published.
    if (preg_match('#^https?://#i', $dataSrc) !== 1) {
        return;
    }

    try {
        $imageContent = file_get_contents($dataSrc);
    } catch (\ErrorException $exception) {
        // Reached when the runtime promotes PHP warnings to exceptions;
        // treat it like a plain failed download.
        $imageContent = false;
    }

    if ($imageContent === false) {
        // Download failed: leave this image untouched rather than storing
        // a broken file or aborting the whole article.
        return;
    }

    // NOTE(review): the extension is always ".jpg" regardless of the real
    // image format — confirm this is acceptable for PNG/GIF sources.
    $imagePath = $imageDirectory . '/' . md5($dataSrc) . '.jpg';

    $disk = Storage::disk('public');
    if (!$disk->put($imagePath, $imageContent)) {
        return;
    }

    $img->setAttribute('src', $disk->url($imagePath));
});