|
@@ -0,0 +1,112 @@
|
|
|
+#-*-coding:utf-8 -*-
|
|
|
+import re,os
|
|
|
+import json,uuid
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+from win32com.client import Dispatch
|
|
|
+
|
|
|
class DocxConverter(object):
    """Convert a Word document to HTML by automating Word through COM.

    Creating an instance starts a hidden ``Word.Application`` and opens
    ``docpath`` in it.  :meth:`docx2html` saves the open document as HTML
    and then closes the document and quits Word, so one instance performs
    one conversion.
    """

    # Numeric value of Word's WdSaveFormat.wdFormatHTML constant.  Without an
    # explicit format SaveAs keeps the document's current format and only the
    # file name would end in ".html".
    WD_FORMAT_HTML = 8

    def __init__(self, docpath=None):
        """Start Word and open the document.

        :param docpath: path of the Word document to convert (required).
        :raises ValueError: if ``docpath`` is empty or ``None``.
        """
        if not docpath:
            raise ValueError("docpath is required")
        # Word runs as a separate process, so hand it an absolute path
        # instead of one relative to this script's working directory.
        self.docpath = os.path.abspath(docpath)
        self.word = Dispatch("Word.Application")
        self.word.Visible = 0
        try:
            self.doc = self.word.Documents.Open(self.docpath)
        except Exception:
            # Do not leave a hidden Word process behind when opening fails.
            self.word.Quit()
            raise

    def docx2html(self):
        """Save the open document as HTML next to the source file.

        The output file gets a random UUID-based name so repeated
        conversions never overwrite each other.  The document is closed and
        Word is shut down afterwards, even if saving fails.

        :returns: path of the generated ``.html`` file.
        """
        html = os.path.join(
            os.path.dirname(self.docpath), str(uuid.uuid4()) + ".html"
        )
        try:
            try:
                self.doc.SaveAs(html, FileFormat=self.WD_FORMAT_HTML)
            finally:
                self.doc.Close()
        finally:
            self.word.Quit()
        return html
|
|
|
+
|
|
|
class QuestionsParser(object):
    """Parse exam questions out of an HTML file exported from Word.

    The document is scanned paragraph by paragraph.  Bracketed markers in
    the paragraph text decide where each paragraph's HTML is stored:

    * ``【题型】`` - question-type heading, applies to the questions after it
    * ``【题干】`` - starts a new question; stored under ``"tg"``
    * ``【知识点】`` - stored under ``"zsd"``
    * ``【难度】`` - stored under ``"nd"``
    * ``【答案】`` - stored under ``"da"``
    * ``【解析】`` - stored under ``"jx"``

    Paragraphs without a marker continue the most recent section.
    """

    # Marker of a question-type heading.
    _TYPE_MARKER = u"【题型】"
    # Marker that opens a new question.
    _STEM_MARKER = u"【题干】"
    # Remaining section markers and the dict key each one selects.  They are
    # checked in this order and the last one found in a paragraph wins.
    _SECTION_MARKERS = (
        (u"【知识点】", "zsd"),
        (u"【难度】", "nd"),
        (u"【答案】", "da"),
        (u"【解析】", "jx"),
    )

    def __init__(self, name="test4.html"):
        """Load and parse the HTML file.

        :param name: path of the HTML file to read.
        """
        # Read raw bytes and let BeautifulSoup detect the encoding from the
        # document itself instead of relying on the platform's default text
        # encoding; the ``with`` block also closes the file handle.
        with open(name, "rb") as html_file:
            self.html = html_file.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    def get_paragraphs(self):
        """Return every ``<p>`` inside ``<div class="WordSection1">``.

        :returns: list of paragraph tags; empty when the div is missing.
        """
        wordsection = self.soup.find("div", class_="WordSection1")
        if wordsection is None:
            return []
        return wordsection.find_all("p")

    def parse_questions(self):
        """Extract the questions from the document.

        :returns: list of dicts with the keys ``tg``, ``tx``, ``zsd``,
            ``nd``, ``da`` and ``jx``.  ``tx`` holds the full text of the
            type heading in effect when the question started; every other
            value is the concatenated HTML of the section's paragraphs.
        """
        data = []
        current = None  # question being filled; None before the first stem
        tx_name = u""   # text of the most recent type heading
        key = ""        # section that receives the current paragraph

        for p in self.get_paragraphs():
            text = p.text

            if self._TYPE_MARKER in text:
                tx_name = text
                # A heading belongs to no question; stop appending to the
                # previous question's last section.
                key = ""

            if self._STEM_MARKER in text:
                if current is not None and current.get("tg"):
                    data.append(current)
                # Stamp the type on the NEW question, while the heading
                # that precedes it is still the current one.
                current = {"tg": "", "tx": tx_name, "zsd": "",
                           "nd": "", "da": "", "jx": ""}
                key = "tg"

            for marker, field in self._SECTION_MARKERS:
                if marker in text:
                    key = field

            # Content seen before the first stem has no question to go to.
            if key and current is not None:
                current[key] += str(p)

        if current is not None and current.get("tg"):
            data.append(current)

        return data

    def get_questions(self):
        """Return the parsed questions with ``tx`` trimmed and ``qno`` added.

        ``tx`` is reduced to the part after the last ``、`` of the type
        heading, and ``qno`` is the question number taken from the stem.

        :returns: list of question dicts.
        """
        questions = self.parse_questions()
        for que in questions:
            que["tx"] = que["tx"].split(u"、")[-1]
            que["qno"] = self.get_qno(que["tg"])
        return questions

    def get_qno(self, tg):
        """Extract the question number from a stem.

        :param tg: HTML of the question stem.
        :returns: the first run of digits found before the first ``、`` of
            the stem's text, or ``""`` when there is none.
        """
        tgsoup = BeautifulSoup(tg, "html.parser")
        prefix = tgsoup.text.split(u"、")[0]
        match = re.search(r"\d+", prefix)
        if match is None:
            return ""
        return match.group()
|
|
|
+
|
|
|
+
|
|
|
# Module-level parser built with QuestionsParser's default arguments; it is
# constructed as soon as this module is imported.
questionparser = QuestionsParser()

if __name__ == "__main__":
    # Script entry point: dump the parsed questions as JSON into "t.json"
    # in the current working directory.
    parsed = questionparser.get_questions()
    with open("t.json", "w+") as out_file:
        payload = json.dumps(parsed)
        out_file.write(payload)
|