#-*-coding:utf-8 -*-
import json
import os
import pprint
import re
import shutil
import threading
import uuid

import bs4
from bs4 import BeautifulSoup


class QuestionsParser(object):
    """Parse exam questions out of an HTML/SVG export.

    The document is scanned for section markers such as ``【题干】`` and the
    elements that follow each marker are collected into per-question records.
    """

    # Record field -> section marker that introduces it. The order matters:
    # when several markers match the current label, the last match wins.
    _MARKERS = (
        ("tx", u"【题型】"),
        ("tg", u"【题干】"),
        ("zsd", u"【知识点】"),
        ("nd", u"【难度】"),
        ("da", u"【答案】"),
        ("jx", u"【解析】"),
    )

    def __init__(self, name="lasttest001.html"):
        """Read the file *name* and parse it with ``html.parser``.

        The file is opened in text mode with the platform default encoding.
        """
        # Context manager so the file handle is closed deterministically.
        with open(name, "r") as source:
            self.html = source.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    @staticmethod
    def _new_record():
        """Return an empty question record with every field present."""
        return {"tg": "", "tx": "", "zsd": "", "nd": "", "da": "", "jx": ""}

    def get_paragraphs(self):
        """Return the flat list of elements to be scanned for questions.

        Every ``<g>`` nested inside a ``<g xml:space="preserve">`` is visited.
        A group that contains a ``<path>`` and has at most two direct children
        is kept as a single item; for any other group its direct children are
        added instead.
        """
        data = []
        for svg in self.soup.find_all("g", attrs={"xml:space": "preserve"}):
            for par in svg.find_all("g"):
                if not isinstance(par, bs4.element.Tag):
                    continue
                children = list(par.children)
                if par.find("path") and len(children) <= 2:
                    data.append(par)
                else:
                    data.extend(children)
        return data

    def parse_questions(self):
        """Extract the question records from the document.

        Returns:
            A list of dicts with the keys ``tg``, ``tx``, ``zsd``, ``nd``,
            ``da`` and ``jx``. ``tx`` holds plain text; the other fields hold
            the concatenated markup of the collected elements. The record
            being built when the input ends is always appended, even if it
            is empty.
        """
        markers = dict(self._MARKERS)
        data = []
        # Start with all fields present so the first "+=" on any field and
        # the later lookups in get_questions() cannot raise KeyError.
        record = self._new_record()
        key = ""
        label = ""

        for p in self.get_paragraphs():
            if isinstance(p, bs4.element.NavigableString):
                continue
            text = p.text.replace("\n", "")

            # Work out which field the label accumulated so far selects.
            for field, marker in self._MARKERS:
                if field == "tg":
                    # The question-stem marker must match exactly; it also
                    # marks the boundary between two questions.
                    if label == marker:
                        key = field
                        # NOTE(review): a new record is started only when the
                        # current one has both a stem and an answer, so a
                        # question without an answer is merged into the next
                        # one - confirm this is intended.
                        if record.get("tg") and record.get("da"):
                            data.append(record)
                            record = self._new_record()
                elif marker in label:
                    key = field

            if u"【" in text:
                # A new marker begins here: restart the label. The element
                # holding the marker is never stored as content.
                label = text
                continue

            # Store content only while the label still carries the marker of
            # the active field; this skips the trailing pieces of a marker
            # that is split over several elements.
            if key and markers[key] in label:
                record[key] += text if key == "tx" else str(p)
            label += text

        data.append(record)
        return data

    def get_questions(self):
        """Return the parsed questions with wrapped fields and a number.

        Each of ``tg``, ``jx``, ``zsd``, ``da`` and ``nd`` is wrapped in
        ``svgstart``/``svgend`` (both are empty strings here) and ``qno`` is
        set to the record's zero-based position as a string.
        """
        svgstart = ''
        svgend = ''
        questions = self.parse_questions()
        for index, que in enumerate(questions):
            if not que:
                continue
            # Each field is wrapped from its own value; "nd" used to be built
            # from "da", which replaced the difficulty with the answer.
            for field in ("tg", "jx", "zsd", "da", "nd"):
                que[field] = svgstart + que[field] + svgend
            que['qno'] = str(index)
        return questions

    def get_qno(self, tg):
        """Extract the question number from the stem markup *tg*.

        Returns the first run of digits found before the first ``、`` in the
        text of *tg*, or an empty string when there are no digits.
        """
        tgsoup = BeautifulSoup(tg, "html.parser")
        head = tgsoup.text.split(u"、")[0]
        match = re.search(r"\d+", head)
        return match.group() if match else ""


if __name__ == "__main__":
    questionparser = QuestionsParser("C:\\AppData\\say365\\htmp\\ecb6c73b-78dc-4b66-83a0-ebf3803645b8.html")
    ques = questionparser.get_questions()
    for que in ques:
        if que:
            print("*"*100)
            print(que["tg"])