Golang·采集网页元素

Golang·使用 chromedp + goquery 简单爬取动态网页的元素数据

1、安装依赖：

在终端执行： go get github.com/chromedp/chromedp

2、方法函数

// GetHttpHtmlContent loads url in a Chrome instance driven by chromedp, waits
// until the element matched by selector is visible, and returns the outer HTML
// of the node that sel resolves to.
//
// selector is handed to chromedp.WaitVisible with its default query mode. sel
// is queried with chromedp.ByJSPath, so it is expected to be a JavaScript
// expression such as `document.querySelector("body")`.
//
// Navigation, waiting and extraction share a single 10-second timeout. Any
// failure is logged and returned together with an empty string. The allocator
// and browser contexts created for the call are cancelled before returning.
func GetHttpHtmlContent(url string, selector string, sel interface{}) (string, error) {
    options := []chromedp.ExecAllocatorOption{
        chromedp.Flag("headless", true), // set to false to watch the browser while debugging
        chromedp.Flag("blink-settings", "imagesEnabled=false"),
        chromedp.UserAgent(`Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36`),
    }
    // Start from chromedp's default options and layer the overrides above on top.
    options = append(chromedp.DefaultExecAllocatorOptions[:], options...)

    // The second result of NewExecAllocator is a cancel function, not an
    // error; it has to be called or the allocator is never released.
    allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), options...)
    defer cancelAlloc()

    // Keep this cancel function under its own name: reusing one variable for
    // both the browser context and the timeout context would drop it and leave
    // the browser context alive after the call.
    chromeCtx, cancelChrome := chromedp.NewContext(allocCtx, chromedp.WithLogf(log.Printf))
    defer cancelChrome()

    // Running with no actions creates the Chrome instance up front, before the
    // timeout below starts counting.
    if err := chromedp.Run(chromeCtx); err != nil {
        log.Println(err)
        return "", err
    }

    // Bound the actual page work to 10 seconds.
    timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 10*time.Second)
    defer cancelTimeout()

    var htmlContent string
    err := chromedp.Run(timeoutCtx,
        chromedp.Navigate(url),
        chromedp.WaitVisible(selector),
        chromedp.OuterHTML(sel, &htmlContent, chromedp.ByJSPath),
    )
    if err != nil {
        log.Println(err)
        return "", err
    }

    return htmlContent, nil
}

3、使用方式

// GetSportsLotter is a usage example: it fetches http://www.baidu.com through
// GetHttpHtmlContent and prints the returned HTML to standard output. When the
// fetch fails the error is logged and nothing is printed.
func GetSportsLotter() {
    url := "http://www.baidu.com"
    // Arguments: the page address, the CSS selector to wait for (for example
    // "body > div.name > div.name2"), and the JavaScript path of the node
    // whose HTML is returned (here document.querySelector("body")).
    html, err := GetHttpHtmlContent(url, "body > div", `document.querySelector("body")`)
    if err != nil {
        log.Println(err)
        return // there is no HTML to process after a failed fetch
    }
    // The HTML was fetched successfully; further processing goes here.
    fmt.Println(html)
}

4、简易版（goquery.NewDocument 采集）

在终端执行： go get github.com/PuerkitoBio/goquery

// Fetch the page and parse it into a goquery document.
doc, err := goquery.NewDocument(url)
if err != nil {
    log.Println("爬取网页数据 异常:", err)
}
body := ""
// Only walk the page when the fetch succeeded; after an error there is no
// usable document and calling Find on it would crash.
if err == nil {
    doc.Find("body").Each(func(i int, s *goquery.Selection) {
        band := s.Find(".mp-classiclist")
        heading := band.Eq(0).Find("h3")
        // An error from Html is deliberately ignored here: the markup is
        // only printed, so an empty string is an acceptable fallback.
        Html, _ := heading.Html() // inner HTML of the heading
        text := heading.Text()    // text content of the heading
        // NOTE(review): the original comment says this collects an image, but
        // the selector targets "div" elements — confirm whether "img" was meant.
        img, exist := band.Find("div").Attr("src")

        fmt.Println(Html, text)
        if exist {
            // Print the attribute so the looked-up value is actually used.
            fmt.Println(img)
        }

        // Whitespace clean-up sketch kept from the original; band is a
        // selection, so this would have to operate on a string instead.
        // band = strings.Replace(band, "\t\t", "", -1)
        // band = strings.Replace(band, "\n\n\n", "", -1)
        body = ""
    })
}

766 Views

Golang·采集网页元素

linwute

留下评论