123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
# -*- coding: utf-8 -*-
- import bs4
- import re,os
- import json,uuid
- from bs4 import BeautifulSoup
- import threading
- import shutil
- import pprint
class QuestionsParser(object):
    """Extract exam questions from an HTML page whose content is SVG groups.

    The page is scanned for ``<g xml:space="preserve">`` groups.  Text such
    as ``【题干】`` acts as a section marker; the elements that follow a
    marker are collected into the matching field of the current question:

    ========  ============  =========================================
    field     marker        content stored
    ========  ============  =========================================
    ``tx``    ``【题型】``    plain text (question type)
    ``tg``    ``【题干】``    element markup (question stem)
    ``zsd``   ``【知识点】``  element markup (knowledge point)
    ``nd``    ``【难度】``    element markup (difficulty)
    ``da``    ``【答案】``    element markup (answer)
    ``jx``    ``【解析】``    element markup (explanation)
    ========  ============  =========================================
    """

    # Field name -> marker text that introduces the field.
    _MARKERS = {
        "tx": u"【题型】",
        "tg": u"【题干】",
        "zsd": u"【知识点】",
        "nd": u"【难度】",
        "da": u"【答案】",
        "jx": u"【解析】",
    }
    # Fields selected by a substring test on the running label, in check
    # order (a later match overrides an earlier one).  ``tg`` is handled
    # separately because it requires an exact label match and also starts
    # a new question record.
    _SUBSTRING_FIELDS = ("tx", "zsd", "nd", "da", "jx")
    # Fields whose markup is wrapped in a standalone <svg> document.
    _SVG_FIELDS = ("tg", "jx", "zsd", "da", "nd")

    _SVG_OPEN = (
        '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
        'xmlns:xlink="http://www.w3.org/1999/xlink" '
        'xmlns:cc="http://creativecommons.org/ns#" '
        'xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" '
        'xmlns:svg="http://www.w3.org/2000/svg" '
        'width="100%" height="180px" viewBox="40 180 900 90" '
        'xml:space="preserve"><g xml:space="preserve">'
    )
    _SVG_CLOSE = '</g></svg>'

    def __init__(self, name="lasttest001.html"):
        """Read the HTML file *name* and parse it with ``html.parser``.

        No explicit encoding is passed to ``open``.
        """
        # Context manager so the file handle is closed as soon as it is read.
        with open(name, "r") as source:
            self.html = source.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    @staticmethod
    def _new_question():
        """Return an empty question record containing every field."""
        return {"tg": "", "tx": "", "zsd": "", "nd": "", "da": "", "jx": ""}

    def get_paragraphs(self):
        """Return the flat list of nodes that make up the document body.

        For every ``<g>`` found inside a ``<g xml:space="preserve">``
        group: a group that contains a ``<path>`` and has at most two
        children is kept whole; any other group contributes its direct
        children instead (these may include text nodes).
        """
        # NOTE(review): find_all() descends recursively by default, so a
        # nested <g> is probably visited both as a child of its parent and
        # on its own -- confirm whether duplicates matter for the input.
        paragraphs = []
        for section in self.soup.find_all("g", attrs={"xml:space": "preserve"}):
            for group in section.find_all("g"):
                if not isinstance(group, bs4.element.Tag):
                    continue
                children = list(group.children)
                if group.find("path") and len(children) <= 2:
                    paragraphs.append(group)
                else:
                    paragraphs.extend(children)
        return paragraphs

    def parse_questions(self):
        """Split the paragraphs into question records.

        Returns:
            list of dict: one record per question with the keys ``tg``,
            ``tx``, ``zsd``, ``nd``, ``da`` and ``jx``.  The record being
            built when the input ends is always appended, so the list is
            never empty.  A record is only emitted at the next ``【题干】``
            if it has both a stem and an answer.
        """
        questions = []
        # Start with a complete record so that content appearing before
        # the first stem marker cannot raise KeyError.
        current = self._new_question()
        key = ""
        # Text accumulated since the last element containing "【"; a marker
        # may be spread over several consecutive elements.
        label = ""
        for p in self.get_paragraphs():
            if isinstance(p, bs4.element.NavigableString):
                continue
            text = p.text

            if label == self._MARKERS["tg"]:
                # A stem marker opens a new question: emit the previous one
                # if it is complete, then start from a clean record.
                # NOTE(review): the reset is unconditional, so anything
                # collected before the stem (including "tx") is discarded
                # -- confirm this is intended.
                key = "tg"
                if current.get("tg") and current.get("da"):
                    questions.append(current)
                current = self._new_question()
            else:
                for field in self._SUBSTRING_FIELDS:
                    if self._MARKERS[field] in label:
                        key = field

            # Collect content only while the label still names the active
            # field, and never collect the marker elements themselves.
            if key and u"【" not in text and self._MARKERS[key] in label:
                if key == "tx":
                    current[key] += text.replace("\n", "")
                else:
                    current[key] += str(p)

            flat_text = text.replace("\n", "")
            # An element containing "【" starts a new label; anything else
            # extends the current one.
            label = flat_text if u"【" in text else label + flat_text

        questions.append(current)
        return questions

    def get_questions(self):
        """Return the parsed questions with their markup wrapped in SVG.

        Each of ``tg``, ``jx``, ``zsd``, ``da`` and ``nd`` is wrapped in a
        standalone ``<svg>`` document; ``tx`` is left as plain text.  A
        ``qno`` field holding the zero-based position of the question (as
        a string) is added.  Missing fields are treated as empty.
        """
        questions = self.parse_questions()
        for index, question in enumerate(questions):
            if not question:
                continue
            for field in self._SVG_FIELDS:
                # Each field is wrapped from its own content; "nd" used to
                # be built from the already wrapped "da" by mistake.
                question[field] = (
                    self._SVG_OPEN + question.get(field, "") + self._SVG_CLOSE
                )
            question["qno"] = str(index)
        return questions

    def get_qno(self, tg):
        """Return the question number found in the stem markup *tg*.

        The number is the first run of digits in the text that precedes
        the first ``、`` separator.

        Raises:
            AttributeError: if that text contains no digits.
        """
        stem_text = BeautifulSoup(tg, "html.parser").text
        leading_part = stem_text.split(u"、")[0]
        return re.search(r"\d+", leading_part).group()
-
if __name__ == "__main__":
    # Parse one exported page and print the stem of every question,
    # each preceded by a separator line.
    parser = QuestionsParser("C:\\AppData\\say365\\htmp\\ecb6c73b-78dc-4b66-83a0-ebf3803645b8.html")
    separator = "*" * 100
    for question in parser.get_questions():
        if not question:
            continue
        print(separator)
        print(question["tg"])
|