要完成百度收录检测,需要反向爬取百度网站。现有情况下,不论使用哪种语言,只要通过 http request 的方式去完成“百度收录检测”,都很容易触发“百度安全验证”;之前实现过使用代理IP、使用第三方API等方式,但都会有结果不正确或不稳定的情况。本次分享的源码使用 golang chromedp 来实现,即通过模拟浏览器来完成对百度的反向爬取。

项目开源地址:

https://github.com/zituocn/rich-api

以下代码使用golang的gow框架,很容易转换为您需要的代码。

一、主文件:main.go

package main

import (
	"github.com/zituocn/gow"
	"github.com/zituocn/logx"
	"github.com/zituocn/richcms/src/api/conn"
	"github.com/zituocn/richcms/src/api/router"
)

// init calls conn.InitLog before main starts.
// NOTE(review): presumably this configures the logger used via logx — confirm
// in package conn.
func init() {
	conn.InitLog()
}

// main builds the default gow engine, applies the application config,
// registers the API routes and runs the server. A non-nil error from Run is
// passed to logx.Panic.
func main() {
	app := gow.Default()
	app.SetAppConfig(gow.GetAppConfig())
	router.APIRouter(app)

	if err := app.Run(); err != nil {
		logx.Panic(err)
	}
}

二、http handler文件:baidu.go

package handler

import (
	"fmt"
	"github.com/zituocn/gow"
	"github.com/zituocn/richcms/src/api/service"
	"strings"
)

// BaiduCheck is the HTTP handler for the Baidu index check.
//
// It reads the "url" request parameter, trims surrounding whitespace and
// rejects an empty value with code 1. Otherwise it delegates to
// service.BaiduService.CheckURL and writes either the lookup result or, on
// failure, code 1 together with the error text.
func BaiduCheck(c *gow.Context) {
	target := strings.TrimSpace(c.GetString("url"))
	if len(target) == 0 {
		c.DataJSON(1, "请传入URL参数")
		return
	}

	svc := &service.BaiduService{}
	result, err := svc.CheckURL(target)
	if err != nil {
		c.DataJSON(1, fmt.Sprintf("查询错误:%v", err))
		return
	}

	c.DataJSON(result)
}

三、核心文件:baidu-service.go

package service

import (
	"bytes"
	"context"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"github.com/chromedp/chromedp"
	"github.com/zituocn/gow/lib/config"
	"net"
	"strings"
	"time"
)

var (
	// ws is the DevTools websocket URL passed to chromedp.NewRemoteAllocator
	// when a listener is detected on port 9222.
	ws = "ws://127.0.0.1:9222"
)

// BaiduService groups the Baidu lookup methods. It has no fields.
type BaiduService struct {
}

// Result is the outcome of a Baidu index check as filled in by CheckURL.
type Result struct {
	// Url is the keyword (URL) that was searched for.
	Url string `json:"url"`
	// Record is true when the result page body contains the keyword.
	Record bool `json:"record"`
	// Title is the text of the ".c-title" element inside the first
	// ".c-container" of the result page.
	Title string `json:"title"`
	// Datetime is the text of the first <span> of the first result's summary.
	// NOTE(review): presumably the date Baidu shows for the entry — confirm.
	Datetime string `json:"datetime"`
	// Description is the text of the second <span> of the first result's summary.
	Description string `json:"description"`
	// Tips is "已收录" when Record is true and empty otherwise.
	Tips string `json:"tips"`
}

// CheckURL searches Baidu for keyword through a chromedp-driven browser and
// reports whether the result page mentions it.
//
// The browser context is created per call and released before returning, so
// no tab or Chrome process outlives the request. The whole interaction is
// bounded by a 15 second timeout; after submitting the search the page is
// given "wait" milliseconds (config key, default 1000) to render before its
// HTML is captured.
//
// Returns:
//   - (nil, nil) when keyword is empty;
//   - (nil, err) when the browser run fails, when the page is Baidu's
//     security-verification (captcha) page, or when the HTML cannot be parsed;
//   - a Result with Record == false when Baidu reports no match;
//   - otherwise a Result whose Record says whether the page body contains
//     keyword, with Title/Datetime/Description taken from the first result.
func (m *BaiduService) CheckURL(keyword string) (ret *Result, err error) {
	if keyword == "" {
		return nil, nil
	}

	const addr = "https://www.baidu.com"

	// Deferred calls run last-in-first-out: the timeout is cancelled first,
	// then the browser context and its allocator are released.
	chromeCtx, release := newChromeCtx()
	defer release()
	timeCtx, cancel := context.WithTimeout(chromeCtx, time.Second*15)
	defer cancel()

	var body string
	waitTime := config.DefaultInt("wait", 1000)
	err = chromedp.Run(timeCtx,
		chromedp.Navigate(addr),
		chromedp.SetValue(`#kw`, keyword, chromedp.ByID),
		chromedp.Click(`su`, chromedp.ByID),
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
		// <body> is visible as soon as the page exists, so give the result
		// list some extra time to render before taking the snapshot.
		chromedp.Sleep(time.Duration(waitTime)*time.Millisecond),
		chromedp.OuterHTML(`body`, &body, chromedp.ByQuery),
	)
	if err != nil {
		return nil, err
	}

	// Baidu answered with its anti-bot page instead of search results.
	if strings.Contains(body, "百度安全验证") {
		return nil, fmt.Errorf("出现 百度安全验证")
	}
	if strings.Contains(body, "https://wappass.baidu.com/static/captcha") {
		return nil, fmt.Errorf("出现百度安全验证")
	}

	ret = &Result{Url: keyword}
	if strings.Contains(body, "未找到相关结果") || strings.Contains(body, "没有找到该URL") {
		return ret, nil
	}

	title, recordTime, description, err := parseBaiduFirstResult(body)
	if err != nil {
		return nil, err
	}

	ret.Record = strings.Contains(body, keyword)
	ret.Title = title
	ret.Description = description
	ret.Datetime = recordTime
	if ret.Record {
		ret.Tips = "已收录"
	}
	return ret, nil
}

// parseBaiduFirstResult extracts the title and the two summary spans from the
// first ".c-container" element of a Baidu result page body.
func parseBaiduFirstResult(body string) (title, recordTime, description string, err error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(body)))
	if err != nil {
		return "", "", "", err
	}

	first := doc.Find(".c-container").First()
	// When several ".c-title" nodes match, the last one wins.
	first.Find(".c-title").Each(func(_ int, s *goquery.Selection) {
		title = strings.TrimSpace(s.Text())
	})

	readBaiduSpanPair(first.Find(".c-gap-top-small"), &recordTime, &description)
	if recordTime == "" || description == "" {
		// Alternative result layout: the summary lives under ".c-span9".
		readBaiduSpanPair(first.Find(".c-span9"), &recordTime, &description)
	}
	return title, recordTime, description, nil
}

// readBaiduSpanPair stores the trimmed text of the first and second <span>
// found under sel into *first and *second; a value is left untouched when the
// corresponding span does not exist.
func readBaiduSpanPair(sel *goquery.Selection, first, second *string) {
	sel.Find("span").Each(func(i int, s *goquery.Selection) {
		switch i {
		case 0:
			*first = strings.TrimSpace(s.Text())
		case 1:
			*second = strings.TrimSpace(s.Text())
		}
	})
}

// newChromeCtx returns a chromedp browser context together with a function
// that cancels both that context and the allocator it was created from.
// Callers must invoke the returned function once they are done.
//
// When checkChromePort reports a listener, the context attaches to the browser
// at ws; otherwise a local Chrome is launched with the options below.
func newChromeCtx() (context.Context, context.CancelFunc) {
	var (
		allocCtx    context.Context
		cancelAlloc context.CancelFunc
	)
	if checkChromePort() {
		allocCtx, cancelAlloc = chromedp.NewRemoteAllocator(context.Background(), ws)
	} else {
		// DefaultExecAllocatorOptions is an array, so slicing it and appending
		// yields a fresh slice and leaves the defaults untouched.
		opts := append(chromedp.DefaultExecAllocatorOptions[:],
			chromedp.Flag("headless", true),                                 // no visible browser window
			chromedp.Flag("enable-automation", false),                       // do not show the automation banner
			chromedp.Flag("disable-blink-features", "AutomationControlled"), // drop Chrome's automation marker
			chromedp.Flag("blink-settings", "imagesEnabled=false"),          // do not render images
			chromedp.Flag("ignore-certificate-errors", true),                // ignore certificate errors
			chromedp.Flag("disable-web-security", true),                     // disable web security checks
			chromedp.DisableGPU,
		)
		allocCtx, cancelAlloc = chromedp.NewExecAllocator(context.Background(), opts...)
	}

	chromeCtx, cancelCtx := chromedp.NewContext(allocCtx)
	return chromeCtx, func() {
		cancelCtx()
		cancelAlloc()
	}
}

// getChromeCtx returns a chromedp browser context without a way to release it.
// It is kept so existing callers keep compiling; because the cancel function
// is dropped here, new code should call newChromeCtx and defer the returned
// cancel instead.
func getChromeCtx() context.Context {
	chromeCtx, _ := newChromeCtx()
	return chromeCtx
}

// checkChromePort reports whether a TCP connection to port 9222 on the local
// system can be established within one second. The probe connection is closed
// again immediately.
func checkChromePort() bool {
	probe, dialErr := net.DialTimeout("tcp", net.JoinHostPort("", "9222"), time.Second)
	if dialErr == nil {
		_ = probe.Close() // only reachability matters; a close error is irrelevant
		return true
	}
	return false
}

代码中使用的 chrome 远程控制通过 docker image 来完成,请执行以下命令(若本机 9222 端口无法连接,代码会改为启动本地的 chrome):

docker pull chromedp/headless-shell
docker run -itd --restart=always --name chromedp -p 9222:9222  chromedp/headless-shell