# -*- coding: utf-8 -*-
# parsedocx.py
import json
import os
import pprint
import re
import shutil
import sys
import threading
import uuid

import bs4
from bs4 import BeautifulSoup
  9. class QuestionsParser(object):
  10. """试题解析
  11. """
  12. def __init__(self,name="lasttest001.html"):
  13. self.html = open(name,"r").read()
  14. self.soup = BeautifulSoup(self.html,"html.parser")
  15. def get_paragraphs(self):
  16. """
  17. """
  18. data = []
  19. wordsection = self.soup.find_all("g",attrs={"xml:space":"preserve"})
  20. for svg in wordsection:
  21. pars = svg.find_all("g")
  22. for par in pars:
  23. if isinstance(par,bs4.element.Tag):
  24. if par.find("path") and len(list(par.children))<=2:
  25. data.append(par)
  26. else:
  27. data.extend(list(par.children))
  28. return data
  29. def parse_questions(self):
  30. """提取试题
  31. """
  32. data = []
  33. tmp_val = {"tx":""}
  34. tx_name = ""
  35. key = ""
  36. paragraphs = self.get_paragraphs()
  37. label = ""
  38. tx_label = ""
  39. tg_label = ""
  40. zsd_label = ""
  41. nd_label = ""
  42. da_label = ""
  43. jx_label = ""
  44. for i,p in enumerate(paragraphs):
  45. if not isinstance(p,bs4.element.NavigableString):
  46. #print(label)
  47. if u"【题型】" in label:
  48. key = "tx"
  49. if u"【题干】" == label:
  50. key = "tg"
  51. if tmp_val.get("tg") and tmp_val.get("da"):
  52. data.append(tmp_val)
  53. tmp_val = {"tg":"","tx":"","zsd":"","nd":"","da":"","jx":""}
  54. if u"【知识点】" in label:
  55. key = "zsd"
  56. if u"【难度】" in label:
  57. key = "nd"
  58. if u"【答案】" in label:
  59. key = "da"
  60. if u"【解析】" in label:
  61. key = "jx"
  62. if key != "":
  63. if u"【" not in p.text:
  64. if key == "tx" and u"【题型】" in label:
  65. tmp_val[key] += p.text.replace("\n","")
  66. if key == "tg" and u"【题干】" in label:
  67. tmp_val[key] += p.__str__()
  68. if key == "zsd" and u"【知识点】" in label:
  69. tmp_val[key] += p.__str__()
  70. if key == "nd" and u"【难度】" in label:
  71. tmp_val[key] += p.__str__()
  72. if key == "da" and u"【答案】" in label:
  73. tmp_val[key] += p.__str__()
  74. if key == "jx" and u"【解析】" in label:
  75. tmp_val[key] += p.__str__()
  76. if u"【" in p.text:
  77. label = ""
  78. label += p.text.replace("\n","")
  79. elif u"】" in p.text:
  80. label += p.text.replace("\n","")
  81. else:
  82. label += p.text.replace("\n","")
  83. data.append(tmp_val)
  84. return data
  85. def get_questions(self):
  86. """
  87. """
  88. svgstart = '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:svg="http://www.w3.org/2000/svg" width="100%" height="180px" viewBox="40 180 900 90" xml:space="preserve"><g xml:space="preserve">'
  89. svgend = '</g></svg>'
  90. questions = self.parse_questions()
  91. for i,que in enumerate(questions):
  92. if que:
  93. que['tg'] = svgstart + que['tg'] + svgend
  94. que['jx'] = svgstart + que['jx'] + svgend
  95. que['zsd'] = svgstart + que['zsd'] + svgend
  96. que['da'] = svgstart + que['da'] + svgend
  97. que['nd'] = svgstart + que['da'] + svgend
  98. que['qno'] = str(i)
  99. #for que in questions:
  100. # if que:
  101. # #que["tx"] = que["tx"].split(u"、")[-1]
  102. # #que["tg"] = que["tg"].replace(u"【题干】","")
  103. # #que["zsd"] = que["zsd"].replace(u"【知识点】","")
  104. # #que["da"] = que["da"].replace(u"【答案】","")
  105. # #que["jx"] = que["jx"].replace(u"【解析】","")
  106. # #que["qno"] = self.get_qno(que["tg"])
  107. return questions
  108. def get_qno(self,tg):
  109. """提取题号
  110. """
  111. tgsoup = BeautifulSoup(tg,"html.parser")
  112. tgtext = tgsoup.text
  113. qno = re.search(r"\d+",tgtext.split(u"、")[0]).group()
  114. return qno
  115. if __name__ == "__main__":
  116. questionparser = QuestionsParser("C:\\AppData\\say365\\htmp\\ecb6c73b-78dc-4b66-83a0-ebf3803645b8.html")
  117. ques = questionparser.get_questions()
  118. for que in ques:
  119. if que:
  120. print("*"*100)
  121. print(que["tg"])
  122. pass
  123. #break