Python常用爬虫代码总结

admin 5个月前 ( 05-22 03:36 ) 0条评论
摘要: #根据标签名、id、class、属性等查找标签###根据class、id、以及属性alog-action的值和标签类别查询soup.find()###查询标签内某属性的值pub...

今日小编就为我们共享一篇关于Python常用爬虫代码总结便利查询,小编觉得内容挺不错的,现在共享给我们,具有很好的参考价值,需求的朋友一同跟随小编来看看吧

beautifulsoup解析页面

from bs4 import BeautifulSoup

# Parse a page with BeautifulSoup (htmltxt is HTML text fetched elsewhere).
soup = BeautifulSoup(htmltxt, "lxml")

# Three different parsers and how each repairs broken markup.
# NOTE(review): the original sample markup was destroyed when this article
# was scraped; "<a></p>" is a representative broken fragment.
soup = BeautifulSoup("<a></p>", "html.parser")
# html.parser: unclosed open tags are auto-completed, stray close tags dropped
# result: <a></a>

soup = BeautifulSoup("<a></p>", "lxml")
# lxml additionally wraps the result in <html><body>
# result: <html><body><a></a></body></html>

soup = BeautifulSoup("<a></p>", "html5lib")
# html5lib completes the full standard document structure
# result: <html><head></head><body><a></a></body></html>


# Find tags by tag name, id, class, attribute value, etc.

# Query by class, id, the value of attribute "alog-action", and tag name
# (original line had an unbalanced trailing parenthesis -- fixed)
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Read the value of one attribute on a tag
pubtime = soup.find("meta", attrs={"itemprop": "datePublished"}).attrs['content']

# All tags whose class is "title"
for i in soup.find_all(class_="title"):
    print(i.get_text())

# At most 2 tags whose class is "title"
for i in soup.find_all(class_="title", limit=2):
    print(i.get_text())

# get_text() accepts a separator placed between nested text nodes, and
# strip=True trims surrounding whitespace.
# NOTE(review): sample markup reconstructed -- the original was lost in scraping.
soup = BeautifulSoup(
    '<p class="title" id="p1">'
    '<b>The Dormouses story</b><b>The Dormouses story</b>'
    '</p>',
    "html5lib")
soup.find(class_="title").get_text("|", strip=True)
# result: The Dormouses story|The Dormouses story

# Get the id of the p tag whose class is "title"
soup.find(class_="title").get("id")

# Match class names with a regular expression
soup.find_all(class_=re.compile("tit"))

# recursive=False: find_all searches only the tag's direct children
soup = BeautifulSoup('<html><head><title>abc</title></head></html>', 'lxml')
soup.html.find_all("title", recursive=False)

unicode编码转中文

# Turn literal "\uXXXX" escape sequences into the characters they denote.
raw = "\\u65f6\\u75c7\\u5b85"
content = raw.encode("utf8", "ignore").decode("unicode_escape")

url encode的编码与解码

from urllib import parse

# Percent-encode a string for use in a URL, then decode it back.
x = "我国你好"
y = parse.quote(x)      # encode
print(y)
x = parse.unquote(y)    # decode
print(x)

html转义字符的解码

import html

# Decode HTML entities back into plain text.
# HTMLParser().unescape was deprecated and removed in Python 3.9;
# html.unescape is the supported replacement.
# NOTE(review): the original sample string was lost when the article was
# scraped; "&lt;div&gt;&amp;&lt;/div&gt;" is a representative fragment.
htmls = "&lt;div&gt;&amp;&lt;/div&gt;"
txt = html.unescape(htmls)
print(txt)  # output: <div>&</div>


base64的编码与解码

import base64

# Base64-encode a str (encode to bytes first, then back to str for display)
content = "测验转码文本123"
contents_base64 = base64.b64encode(content.encode('utf-8', 'ignore')).decode("utf-8")
# Base64-decode -- b64decode returns bytes, so decode back to str
# (the original left this as bytes, asymmetric with the encode side)
contents = base64.b64decode(contents_base64).decode("utf-8")

过滤emoji表情

def filter_emoji(desstr, restr=''):
    """Remove emoji (non-BMP characters) from *desstr*, replacing each with *restr*."""
    try:
        # Wide unicode build: match astral-plane code points directly.
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Narrow build fallback: match UTF-16 surrogate pairs instead.
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, desstr)

彻底过滤script和style标签

import requests
from bs4 import BeautifulSoup

# Remove <script> and <style> tags -- including their contents -- entirely.
# Calling soup([...]) is shorthand for soup.find_all([...]).
soup = BeautifulSoup(htmls, "lxml")
for script in soup(["script", "style"]):
    script.extract()
print(soup)

过滤html的标签,但保存标签里的内容

import re

# Strip HTML tags but keep the text inside them.
# NOTE(review): the sample markup was destroyed when the article was scraped;
# reconstructed so the documented output "abc" holds.
htmls = "<p>abc</p>"
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = dr.sub('', htmls)
print(htmls2)  # output: abc
正则提取内容(一般处理json)
# Sample JSONP-style payload, held as a string -- regex extraction is handy
# for quasi-JSON like this (reconstructed; spam text removed from the scrape).
htmls = '''rollback({
    "response": {
        "code": "0",
        "msg": "Success",
        "dext": ""
    },
    "data": {
        "count": 3,
        "page": 1,
        "article_info": [{
            "title": "“小库里”:习惯竞赛是首要任务 投篮终会找到节奏",
            "url": "http:\\/\\/sports.qq.com\\/a\\/20180704\\/035378.htm",
            "time": "2018-07-04 16:58:36",
            "column": "NBA",
            "img": "",
            "desc": ""
        }, {
            "title": "首钢体育助力国家冰球集训队 我国冰球联赛年末发动",
            "url": "http:\\/\\/sports.qq.com\\/a\\/20180704\\/034698.htm",
            "time": "2018-07-04 16:34:44",
            "column": "归纳体育",
            "img": "",
            "desc": ""
        }]
    }
})'''

import re
# Extract every article's title and url from the payload.
# (.*?) is a non-greedy capture group; a bare .*? skips the text in between;
# \s* tolerates optional whitespace after the colons (the sample has it).
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = re.findall(pattern, htmls)
for i in items:
    title = i[0]  # was misspelled `tilte`
    url = i[1]

时刻操作

import time
import datetime

# Today's date
today = datetime.date.today()
print(today)  # e.g. 2018-07-05

# Current time, formatted
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(time_now)  # e.g. 2018-07-05 14:20:55

# Format a unix timestamp (interpreted in local time)
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55

# Parse a string into a datetime.
# (The original assigned to `str` -- shadowing the builtin -- and then
#  referenced an undefined name `st`; fixed with a proper variable name.)
time_str = "2018-07-01 00:00:00"
datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")

# Convert a time string to a unix timestamp
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, "%Y-%m-%d %H:%M:%S")
time_line2 = int(time.mktime(time_tuple))

# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # e.g. 2018-07-06

# Three days ago
today = datetime.datetime.today()
tomorrow = today + datetime.timedelta(days=-3)
print(tomorrow)  # e.g. 2018-07-02 13:37:00.107703

# Difference between a start time and now
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
minutes = (time_now - b).seconds / 60  # NOTE: .seconds is only the sub-day remainder
days = (time_now - b).days
all_minutes = days * 24 * 60 + minutes
print(minutes)
print(days)
print(all_minutes)

数据库操作

import pymysql

# Connect (host/credentials here are placeholders)
conn = pymysql.connect(host='10.0.8.81', port=3306, user='root', passwd='root',
                       db='xxx', charset='utf8')
cur = conn.cursor()

# The original literal was missing its closing quote -- fixed.
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
row_id = 1  # renamed from `id`, which shadows the builtin
name = "like"
age = 26
data_list = []
data = (row_id, name, age)

# Single insert (parameterized -- the driver escapes the values)
cur.execute(insert_sql, data)
conn.commit()

# Batch insert
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()

# Special characters in values are handled automatically by parameterized
# queries; pymysql.escape_string is only needed when interpolating by hand.
data = (row_id, pymysql.escape_string(name), age)

# Update -- use parameters instead of %-interpolating into the SQL string
# (the original built the statement with string formatting, which is
#  injection-prone and needed manual escaping)
update_sql = "update tbl_name set content = %s where id = %s"
cur.execute(update_sql, (content, row_id))
conn.commit()

# Batch update, flushed once more than 500 rows are queued.
# (The original `try` had no except/finally -- a syntax error; completed here.)
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        conn.rollback()

以上便是小编今日为我们总结的一些Python常用的爬虫代码。

一起小编将19年最新的python学习材料共享给我们

学习材料收取方法:转发+重视后私信小编“材料”即可收取

文章版权及转载声明:

作者:admin本文地址:http://www.cbte.com.cn/articles/1280.html发布于 5个月前 ( 05-22 03:36 )
文章转载或复制请以超链接形式并注明出处。