golang之http

go获取html页面

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httputil"
)

// main fetches a Tieba thread with http.Get, prints the HTML body, and then
// prints a dump of the HTTP response produced by httputil.DumpResponse.
func main() {
	// http.Get takes a URL and returns a *http.Response and an error.
	res, err := http.Get("https://tieba.baidu.com/p/5524106374?red_tag=3108778011")
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// The content lives in res.Body, which must be closed when we are done.
	// The defer is registered only after the error check: res is nil when
	// http.Get fails, so touching res.Body before the check would panic.
	defer res.Body.Close()

	// ioutil.ReadAll takes an io.Reader, reads it to the end, and returns a
	// []byte and an error.
	html, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// html is a []byte, so convert it to a string before printing.
	fmt.Println(string(html))
	/*
		Excerpt of the output:
		<!DOCTYPE html><!--STATUS OK--><html><head><meta name="keywords" content="百度贴吧,椎名真白资源,白原,图高"/
	*/

	// Besides the HTML page we can also print the response message itself.
	// httputil.DumpResponse takes the *http.Response and a bool that says
	// whether the content of res.Body should be dumped as well; true is
	// passed here.
	// NOTE: ReadAll above has already consumed res.Body, so there is little or
	// no body content left for the dump; call DumpResponse before ReadAll if
	// the body should appear in the dump too.
	dump, err := httputil.DumpResponse(res, true)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// dump is again a []byte and has to be converted to a string.
	fmt.Println(string(dump))
	/*
		HTTP/1.1 200 OK
		Transfer-Encoding: chunked
		Connection: keep-alive
		Content-Type: text/html; charset=UTF-8
		Date: Fri, 16 Nov 2018 08:26:50 GMT
		P3p: CP=" OTI DSP COR IVA OUR IND COM "
		Server: Apache
		Set-Cookie: TIEBA_USERTYPE=5c40c0e62e72fae312d68f2d; expires=Thu, 31-Dec-2020 15:59:59 GMT; path=/; domain=tieba.baidu.com
		Set-Cookie: wise_device=0; path=/
		Set-Cookie: BAIDUID=E09822172FC8F15E98583B8B2380A118:FG=1; expires=Sat, 16-Nov-19 08:26:50 GMT; max-age=31536000; path=/; domain=.baidu.com; version=1
		Tracecode: 16103786370354826762111616
		Tracecode: 16103786370976641034111616
		Vary: Accept-Encoding
		X-Xss-Protection: 1; mode=block
	*/
}

go进行post请求

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"
)

// main demonstrates the two convenience helpers for form submissions:
// http.Post with a hand-written body and http.PostForm with url.Values.
func main() {
	// A login can be simulated with http.Post, which takes three arguments:
	// func Post(url, contentType string, body io.Reader) (resp *Response, err error)
	// contentType is usually just "application/x-www-form-urlencoded".
	// body is the request body, similar to data in Python's requests.post,
	// but it is a single io.Reader that must already be encoded.
	res, err := http.Post("xxxxx", "application/x-www-form-urlencoded", strings.NewReader("phone=12345421"))
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// Registered only after the error check: res is nil when the request
	// fails, so touching res.Body earlier would panic.
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	fmt.Println(string(body))

	// To send several fields, use http.PostForm:
	// func PostForm(url string, data url.Values) (resp *Response, err error)
	// Here data really does correspond to data in requests.post.
	// The values are wrapped in {} because url.Values is declared as
	// type Values map[string][]string, so every key maps to a slice.
	res1, err1 := http.PostForm("xxxxxx", url.Values{"username": {"1245412154"}, "passwd": {"dadsadsadas"}})
	if err1 != nil {
		fmt.Println("err1=", err1)
		return
	}
	defer res1.Body.Close()
	body, err1 = ioutil.ReadAll(res1.Body)
	if err1 != nil {
		fmt.Println("err1=", err1)
		return
	}
	// body is a []byte; convert it so the text is printed, not the byte values.
	fmt.Println(string(body))
}

以上就是用 Go 发送 GET 和 POST 请求的方法。但是上面的请求都没有设置请求头，很容易被网站的反爬虫机制拦截，那么如何加上请求头呢？

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"strings"
)

// main shows how to attach custom headers to a request: build it with
// http.NewRequest, add the headers, and send it with (*http.Client).Do.
func main() {
	url := "https://tieba.baidu.com/p/5524106374?red_tag=3108778011"
	// http.NewRequest only builds the request; nothing is sent yet, which is
	// what gives us the chance to add headers first.
	// func NewRequest(method, url string, body io.Reader) (*Request, error)
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// Header.Add takes two strings, the header name and its value.
	// Cookies travel in the "Cookie" request header.
	req.Header.Add("Cookie", "sessionid=5edb1f18c5a0cb334b42b2383c899e01")
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36")

	// With the headers in place, a client actually performs the request.
	// func (c *Client) Do(req *Request) (*Response, error)
	// Do takes a *Request and returns a *Response and an error, the same
	// pair that http.Get returns.
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// Close the body once we are done with it; resp is non-nil here because
	// the error was checked above.
	defer resp.Body.Close()
	// Read the body with ioutil.ReadAll, as before.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	fmt.Println(string(body))

	// A POST request is built the same way: use the "POST" method and pass
	// the URL-encoded form fields as the request body.
	postReq, err := http.NewRequest("POST", url, strings.NewReader("username=satori&password=123"))
	if err != nil {
		fmt.Println("err=", err)
		return
	}
	// This header is required, otherwise the form fields are not parsed
	// correctly on the receiving side.
	postReq.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	// The request is only built here, not sent; sending it works exactly
	// like the GET request above, i.e. client.Do(postReq).
}

实战演练

下面我们把帖子页面中的所有图片爬取下来。

package main

import (
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strconv"
	"sync"
)

// picURLPattern captures the src attribute of <img> tags that point at a
// .jpg file. It is compiled once at package level because the pattern never
// changes between calls.
var picURLPattern = regexp.MustCompile(`<img.+?src="(.+?jpg)`)

// get_pic_url downloads the page at url and returns the .jpg image URLs found
// in its <img> tags whose length is exactly target_length bytes. The length
// filter is how unwanted images (for example user-uploaded emoticons) are
// told apart from the pictures we want.
//
// If the request cannot be built or sent, or the body cannot be read, the
// failure is logged and nil is returned. A page without matching images
// yields an empty, non-nil slice.
func get_pic_url(url string, target_length int) []string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("get_pic_url: building request for %q: %v", url, err)
		return nil
	}
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36")

	client := &http.Client{}
	res, err := client.Do(req)
	if err != nil {
		log.Printf("get_pic_url: fetching %q: %v", url, err)
		return nil
	}
	defer res.Body.Close()

	// The whole HTML page, still as raw bytes at this point.
	html, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Printf("get_pic_url: reading %q: %v", url, err)
		return nil
	}

	// FindAllStringSubmatch returns, for every match, a slice holding the
	// full match at index 0 and the captured group at index 1. Only the
	// captured group is wanted, which is why FindAllString (full matches
	// only) would not do here.
	matches := picURLPattern.FindAllStringSubmatch(string(html), -1)
	picURLs := make([]string, 0, len(matches))
	for _, m := range matches {
		// Keep only URLs whose length equals target_length.
		if len(m[1]) == target_length {
			picURLs = append(picURLs, m[1])
		}
	}
	return picURLs
}

//连接有了，下面我们下载图片
// download fetches every URL in pic_url_array concurrently and writes each
// response body to <index>.jpg inside the hard-coded picture directory, where
// index is the URL's position in the slice. It returns once all goroutines
// have finished.
//
// Downloading is best-effort: a picture that cannot be requested, fetched,
// read, or written is logged and skipped without affecting the others.
func download(pic_url_array []string) {
	const picDir = `C:\Users\Administrator\go\src\awesomeProject\pic\`

	// A single client is shared by all goroutines.
	client := &http.Client{}

	var wg sync.WaitGroup
	wg.Add(len(pic_url_array))
	for index, url := range pic_url_array {
		// One goroutine per picture; index and url are passed as arguments
		// so every goroutine works on its own copies.
		go func(index int, url string) {
			// Deferred so the WaitGroup is released on every return path.
			defer wg.Done()

			req, err := http.NewRequest("GET", url, nil)
			if err != nil {
				log.Printf("download: picture %d: building request for %q: %v", index, url, err)
				return
			}
			req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36")

			res, err := client.Do(req)
			if err != nil {
				log.Printf("download: picture %d: fetching %q: %v", index, url, err)
				return
			}
			defer res.Body.Close()
			// Do not save an error page under a .jpg name.
			if res.StatusCode != http.StatusOK {
				log.Printf("download: picture %d: fetching %q: unexpected status %s", index, url, res.Status)
				return
			}

			// The body is already raw bytes, so it can be written to the
			// file as-is without any conversion.
			content, err := ioutil.ReadAll(res.Body)
			if err != nil {
				log.Printf("download: picture %d: reading %q: %v", index, url, err)
				return
			}
			target := picDir + strconv.Itoa(index) + ".jpg"
			// 0644: an image is a plain data file and needs no execute bit.
			if err := ioutil.WriteFile(target, content, 0644); err != nil {
				log.Printf("download: picture %d: writing %q: %v", index, target, err)
			}
		}(index, url)
	}
	wg.Wait()
}
func main(){
	target_length := len("https://imgsa.baidu.com/forum/w%3D580/sign=50b8de14898ba61edfeec827713597cc/af59f91b0ef41bd54ce3302a5ada81cb38db3d00.jpg")
	pic_url_array :=get_pic_url("https://tieba.baidu.com/p/5524106374?red_tag=3108778011", target_length)
	download(pic_url_array)
}

巴特西

golang之http

go获取html页面

go进行post请求

实战演练

最新文章

热门文章