文件名处理:

# Characters such as ? ‘ ’ are illegal in file names, so every special character
# found in `name` is replaced with a space. Note that whitespace, `.` and `_`
# are part of the filtered set as well.
# The pattern is a raw string: in a normal string literal `\s`, `\.`, `\!` and
# `\/` are invalid escape sequences that newer Python versions warn about.
filename = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]", ' ', name)

头部协议格式化:

从 Chrome 等浏览器复制出来的请求头(header)不能直接使用,可以用下面这段代码将其转换成字典或者 JSON。

import re
import json
# Sample request headers as copied from the browser: one "Name: value" pair
# per line. Used as demo input for headersstr_headersjosn() below.
headers_str = '''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: wp-settings-1=hidetb
Host: www.hekaiyu.cn
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36
'''
def headersstr_headersjosn(headers_str):
    """Convert a block of raw "Name: value" header lines into a JSON string.

    Each line is split at the first ": " (colon followed by a space); lines
    without that separator are ignored. When a header name occurs more than
    once, the last value wins.

    Returns the headers as a JSON object string with sorted keys and a
    4-space indent.
    """
    line_re = re.compile('^(.*?): (.*)$')
    parsed = {}
    for raw_line in headers_str.splitlines():
        hit = line_re.match(raw_line)
        if hit is None:
            # Blank line or a line that is not in "Name: value" form.
            continue
        key, value = hit.groups()
        parsed[key] = value
    return json.dumps(parsed, sort_keys=True, indent=4)
# Demo: convert the sample headers above and print the resulting JSON text.
header_json=headersstr_headersjosn(headers_str)
print(header_json)

读取文件:

with open('workfile') as f:
    # read() returns the whole content as one string and leaves the file
    # position at the end of the file.
    read_data = f.read()
    # Rewind first: at end-of-file readline() would only return ''.
    f.seek(0)
    # readline() reads a single line.
    read_line = f.readline()
    # Rewind again so that readlines() sees every line, not just the ones
    # after the line consumed above.
    f.seek(0)
    # readlines() reads all lines and returns them as a list.
    read_lines_list = f.readlines()

写入文件:

def downtext(content, name, encoding='utf-8'):
    """Write ``str(content)`` to the text file ``<name>.txt``.

    An existing file is overwritten.

    Args:
        content: Object to save; it is converted with ``str()`` first.
        name: Target path without the ``.txt`` suffix.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            result does not depend on the platform's locale encoding, which
            may be unable to represent some characters.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(name + '.txt', 'w', encoding=encoding) as file:
        file.write(str(content))

保存图片:

def downimg(path, img_url):
    """Download the image at ``img_url`` into the ``images/`` directory.

    The file is stored as ``"images/" + path + <basename of img_url>``; the
    target directory is not created here and must already exist.

    Args:
        path: Prefix inserted between ``images/`` and the file name.
        img_url: URL of the image to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    # The original code used an undefined name here instead of the parameter.
    file_name = os.path.basename(img_url)
    header = {
        'Referer': 'https://www.adbug.cn/adSearch?keyword=%E9%A1%BE%E8%BD%BB%E8%88%9F',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    # Fetch before opening the file so a failed request leaves no empty file.
    resp = requests.get(img_url, headers=header, timeout=30)
    # Do not save an HTML error page under an image file name.
    resp.raise_for_status()
    with open("images/" + path + file_name, 'wb') as f:
        f.write(resp.content)
    print(file_name + "保存成功")

过滤html代码:

##############################
#过滤HTML中的标签
#将HTML中标签等信息去掉
#@param htmlstr HTML字符串.
#https://cloud.tencent.com/developer/ask/28530
#doc=html_to_text(html)调用
##############################
import re

try:  # Python 2 module names
    from HTMLParser import HTMLParser, HTMLParseError
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 renamed both modules
    from html.parser import HTMLParser
    from html.entities import name2codepoint

    class HTMLParseError(Exception):
        """Stand-in for the exception that Python 3.5 removed from html.parser."""
try:
    _unichr = unichr  # Python 2: chr() there only returns byte strings
except NameError:
    _unichr = chr  # Python 3: unichr() is gone, chr() returns text


class _HTMLToText(HTMLParser):
    """HTMLParser subclass that collects the readable text of a document.

    ``<p>`` and ``<br>`` produce newlines, everything inside ``<script>`` and
    ``<style>`` is dropped, and whitespace inside text nodes is collapsed.
    Feed it with ``feed()`` / ``close()`` and read the result with
    ``get_text()``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self._buf = []  # collected text fragments, joined by get_text()
        self.hide_output = False  # True while inside <script> / <style>

    def handle_starttag(self, tag, attrs):
        if tag in ('p', 'br') and not self.hide_output:
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = True

    def handle_startendtag(self, tag, attrs):
        # Self-closing form, e.g. <br/>.
        if tag == 'br':
            self._buf.append('\n')

    def handle_endtag(self, tag):
        if tag == 'p':
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = False

    def handle_data(self, text):
        if text and not self.hide_output:
            self._buf.append(re.sub(r'\s+', ' ', text))

    # NOTE: on Python 3 the parser decodes references itself by default
    # (convert_charrefs=True) and passes them to handle_data(); the two
    # handlers below are then only used on Python 2 or with that flag off.

    def handle_entityref(self, name):
        # Named reference such as &amp;; unknown names are dropped.
        if name in name2codepoint and not self.hide_output:
            self._buf.append(_unichr(name2codepoint[name]))

    def handle_charref(self, name):
        # Numeric reference: decimal (&#65;) or hexadecimal (&#x41; / &#X41;).
        if self.hide_output:
            return
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            char = _unichr(code)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: emit U+FFFD instead of
            # letting the error abort the whole parse.
            char = _unichr(0xFFFD)
        self._buf.append(char)

    def get_text(self):
        """Return the collected text with runs of spaces squeezed to one."""
        return re.sub(r' +', ' ', ''.join(self._buf))

def html_to_text(html):
    """
    Return the plain text contained in the HTML string *html*.

    Entities and character references are decoded; the content of
    <script> and <style> elements is left out of the result.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        extractor.close()
    except HTMLParseError:
        # Best effort: keep the text gathered before the parser gave up.
        pass
    return extractor.get_text()

def text_to_html(text):
    """
    Convert plain text to HTML: anything that looks like an http(s) URL is
    wrapped in an <a> tag and the characters & ' " < > are replaced by HTML
    entities. Newlines are left untouched.

    A URL ends at whitespace or at any of ] ( ) " ' ; < > so that markup
    characters next to a link are escaped instead of ending up inside the
    generated tag.
    """
    entities = {'&': '&amp;', "'": '&#39;', '"': '&quot;', '<': '&lt;', '>': '&gt;'}

    def replace(match):
        token = match.group()
        if len(token) == 1:
            # Single special character (a URL match is always longer).
            return entities[token]
        # '&' is the only special character the URL pattern lets through.
        safe_url = token.replace('&', '&amp;')
        return '<a href="%s">%s</a>' % (safe_url, safe_url)

    return re.sub(r'https?://[^]\s()"\';<>]+|[&\'"<>]', replace, text)
    
#####################
#过滤HTML中的标签结束
#####################