文件名处理:
# Replace characters that are unsafe or unwanted in a file name (e.g. ? / * and
# quotes) with a space before using `name` as a file name.
# The first alternative collapses a whole run of whitespace / ASCII punctuation
# into a single space; the second replaces each listed character one by one.
# A raw string is used so the regex escapes (\s, \., \!, \/) are not treated as
# invalid string escapes by the Python compiler.
filename = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]", ' ', name)
头部协议格式化:
从 Chrome 等浏览器复制出来的请求头(header)不能直接使用,可以用下面这段代码把它转换成字典或者 JSON。
import re
import json
# Sample request headers in the "Name: value" one-per-line form that a browser
# produces when headers are copied from it; used as demo input for the
# converter defined below.
headers_str = '''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: wp-settings-1=hidetb
Host: www.hekaiyu.cn
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36
'''
def headersstr_headersjosn(headers_str):
    """Convert a block of raw "Name: value" header lines into a JSON string.

    Each line is split at the first ": " (colon followed by a space); lines
    without that separator (blank lines, request lines, ...) are ignored.
    Whitespace around the name and the value is stripped so that indented or
    padded pastes do not produce names such as "  Accept". When a name occurs
    more than once, the last occurrence wins.

    :param headers_str: multi-line text with one header per line.
    :return: a JSON object as text, keys sorted, indented by 4 spaces.
    """
    header = {}
    for line in headers_str.splitlines():
        # partition() splits at the FIRST ": ", so values may contain ": ".
        field, separator, value = line.partition(': ')
        if not separator:
            continue
        header[field.strip()] = value.strip()
    return json.dumps(header, sort_keys=True, indent=4)
# Demo: convert the sample headers and print the resulting JSON text.
header_json = headersstr_headersjosn(headers_str)
print(header_json)
读取文件:
# Demonstrates the three read methods of a file object on the same file.
# Each call continues from the current file position, so the file is rewound
# with seek(0) in between; otherwise, after read() has consumed everything,
# readline() would return '' and readlines() would return [].
with open('workfile') as f:
    # read() returns the entire content as one string.
    read_data = f.read()
    # readline() returns a single line (here: the first one, after rewinding).
    f.seek(0)
    read_line = f.readline()
    # readlines() returns all lines as a list of strings.
    f.seek(0)
    read_lines_list = f.readlines()
写入文件:
def downtext(content, name):
    """Write ``str(content)`` to the text file ``name + '.txt'``.

    The file is opened in 'w' mode, so an existing file with that name is
    overwritten. The ``with`` statement closes the file on exit.

    :param content: any object; its ``str()`` form is written.
    :param name: target path without the ``.txt`` suffix.
    """
    target = name + '.txt'
    with open(target, 'w') as out:
        out.write(str(content))
保存图片:
def downimg(path, img_url):
    """Download ``img_url`` and save it below the ``images/`` directory.

    The target file is ``"images/" + path + <last path component of img_url>``;
    ``path`` is concatenated as-is, so it must already end with a separator
    if it is meant to be a sub-directory.

    The download happens before the file is opened, so a failed request does
    not leave an empty file behind.

    :param path: prefix inserted between ``images/`` and the file name.
    :param img_url: URL of the image; also determines the saved file name.
    """
    header = {
        'Referer': 'https://www.adbug.cn/adSearch?keyword=%E9%A1%BE%E8%BD%BB%E8%88%9F',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    # The file name comes from the URL that is actually downloaded (the
    # original referenced an undefined name here instead of the parameter).
    file_name = os.path.basename(img_url)
    # NOTE(review): the HTTP status is not checked, so the body of an error
    # response would be saved as if it were the image - confirm this is wanted.
    resp = requests.get(img_url, headers=header)
    with open("images/" + path + file_name, 'wb') as f:
        f.write(resp.content)
    print(file_name + "保存成功")
过滤html代码:
##############################
# Strip the tags from HTML
# Removes tags and similar markup from an HTML string, leaving plain text.
# @param html  the HTML string.
# Source: https://cloud.tencent.com/developer/ask/28530
# Usage: doc = html_to_text(html)
##############################
import re

try:  # Python 3 module locations
    from html.entities import name2codepoint
    from html.parser import HTMLParser

    class HTMLParseError(Exception):
        """Placeholder for the exception type that Python 3.5+ no longer has."""
except ImportError:  # Python 2 module locations
    from HTMLParser import HTMLParser, HTMLParseError
    from htmlentitydefs import name2codepoint
class _HTMLToText(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self._buf = []
self.hide_output = False
def handle_starttag(self, tag, attrs):
if tag in ('p', 'br') and not self.hide_output:
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = True
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self._buf.append('\n')
def handle_endtag(self, tag):
if tag == 'p':
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = False
def handle_data(self, text):
if text and not self.hide_output:
self._buf.append(re.sub(r'\s+', ' ', text))
def handle_entityref(self, name):
if name in name2codepoint and not self.hide_output:
c = unichr(name2codepoint[name])
self._buf.append(c)
def handle_charref(self, name):
if not self.hide_output:
n = int(name[1:], 16) if name.startswith('x') else int(name)
self._buf.append(unichr(n))
def get_text(self):
return re.sub(r' +', ' ', ''.join(self._buf))
def html_to_text(html):
    """Return the plain text contained in a piece of HTML.

    The markup is fed through ``_HTMLToText``, which decodes entities and
    character references and leaves out the content of script and style
    elements.

    :param html: the HTML source as a string.
    :return: the text collected by the parser.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        extractor.close()
    except HTMLParseError:
        # Best effort: on a parse error, return what was collected so far.
        pass
    return extractor.get_text()
def text_to_html(text):
    """Convert plain text to HTML-safe text with clickable links.

    Anything that looks like an http(s) URL is wrapped in an ``<a>`` tag, and
    the characters ``& ' " < >`` are replaced by HTML entities - both in the
    surrounding text and inside the URL used for the link. Newlines are left
    unchanged.

    :param text: the plain text to convert.
    :return: the HTML string.
    """
    entities = {'&': '&amp;', "'": '&#39;', '"': '&quot;', '<': '&lt;', '>': '&gt;'}

    def escape(fragment):
        # Escape every special character of a multi-character fragment.
        return ''.join(entities.get(ch, ch) for ch in fragment)

    def replace(match):
        token = match.group()
        if len(token) == 1:
            # Second alternative of the pattern: a single special character.
            return entities[token]
        # First alternative: a URL. It may still contain & < >, so escape it
        # before placing it in the attribute and in the link text.
        safe_url = escape(token)
        return '<a href="%s">%s</a>' % (safe_url, safe_url)

    return re.sub(r'https?://[^] ()"\';]+|[&\'"<>]', replace, text)
#####################
#过滤HTML中的标签结束
#####################