mergepdf2html.py 981 B

123456789101112131415161718192021222324252627
  1. #-*-coding:utf-8 -*-
  2. import os
  3. htmlstr = "<html><head></head><body>"
  4. def pdf2html(srcpdf,outdir="C:\\AppData\\say365\\htmp"):
  5. """
  6. """
  7. if not os.path.exists(outdir):
  8. os.makedirs(outdir)
  9. outsvg = os.path.join(outdir,srcpdf.split("\\")[-1]).replace(".pdf",".svg")
  10. tmphtml = os.path.join(outdir,srcpdf.split("\\")[-1]).replace(".pdf",".html")
  11. cmd = "E:\\WorkSpace\\docxconvert\\say365parser\pdf2vec.exe -useunicode 1 %s %s -winfont2" % (srcpdf,outsvg)
  12. os.system(cmd)
  13. with open(tmphtml,"w+") as f:
  14. #f.write(htmlstr)
  15. for name in os.listdir(outdir):
  16. tmpsvg = os.path.join(outdir,name)
  17. content = open(tmpsvg,"r").read()
  18. f.write(content)
  19. #f.write("</body></html>")
  20. return tmphtml
  21. if __name__ == "__main__":
  22. srcpdf = "E:\\WorkSpace\\docxconvert\\ecb6c73b-78dc-4b66-83a0-ebf3803645b8.pdf"
  23. outdir = "C:\\AppData\\say365\\htmp"
  24. pdf2html(srcpdf,outdir)