获取 HTML 文件路径、Word 模板路径,以及 HTML 中图片路径的前缀
public class Html2Word { public static void main(String[] args) throws Exception { String html2WordTemplatePath = "D:\\test\\test\\html2word\\html to word template (1).docx"; Document htmlDocument = Jsoup.parse(new File("D:\\test\\test\\html2word\\test-1.htm")); NiceXWPFDocument document = new NiceXWPFDocument(Files.newInputStream(Paths.get(html2WordTemplatePath))); boolean[] returnFlag = {false}; String filePathPre = "D:\\test\\test\\html2word\\"; List<Node> nodes = htmlDocument.body().childNodes(); for (Node node : nodes) { if (!(node instanceof Element)){ continue; } if (StringUtils.isEmpty(node.toString().trim())){ continue; } XWPFParagraph paragraph; if (node.nodeName().equalsIgnoreCase("table") || node.nodeName().equalsIgnoreCase("ul") || node.nodeName().equalsIgnoreCase("span")){ paragraph = document.getLastParagraph(); }else { paragraph = document.createParagraph(); } if (node.nodeName().equalsIgnoreCase("span")){ paragraph.setSpacingAfter(200); continue; } SarHtml2WordUtils.parseHtmlToWord(node,document,paragraph,returnFlag,filePathPre); if (returnFlag[0]){ break; } } document.getLastParagraph().createRun().addBreak(BreakType.PAGE); // 写入到输出流 String outPath = "D:\\test\\test\\html2word" + System.currentTimeMillis() + ".docx"; OutputStream outputStream1 = Files.newOutputStream(Paths.get(outPath)); document.write(outputStream1); outputStream1.close(); }}
读取 HTML 中的换行、颜色等信息,并写入 Word 文档中
public class Html2WordUtils { /** * 解析 html 格式内容 转变为 word * * @param node HTML的node 节点 * @param doc word 文档对象 * @param xwpfParagraph 段落 * @throws Exception 异常信息 */ public static void parseHtmlToWord(Node node, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph,boolean[] returnFlag, String filePathPre) throws Exception { List<Node> nodes = node.childNodes(); if (CollectionUtils.isNotEmpty(nodes)) { for (Node childNode : nodes) { parseHtmlToWord(childNode, doc, xwpfParagraph,returnFlag,filePathPre); } } //处理table标签 if ("table".equalsIgnoreCase(node.nodeName())) { parseTableToWord(doc, node, xwpfParagraph,filePathPre); returnFlag[0] = true; return; } if (CollectionUtils.isNotEmpty(node.childNodes())) { return; } String nodeValue = node.toString(); Node parent = node.parent(); boolean boldFlag = false; String color = ""; boolean subFlag = false; boolean supFlag = false; boolean ulFlag = false; boolean tableFlag = false; if (null != parent) { String parentNodeName = parent.nodeName(); if (parentNodeName.equalsIgnoreCase("strong") || parentNodeName.equalsIgnoreCase("b")) { boldFlag = true; } else if (parentNodeName.equalsIgnoreCase("font")) { if (Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("strong") || Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("b")) { boldFlag = true; } String color1 = parent.attr("color"); if (StringUtils.isNotEmpty(color1)){ if (!Objects.equals("#ff0000",color1)){ return; } color = color1.substring(1); } Node parented = parent.parent(); if (null != parented){ if (parented.nodeName().equalsIgnoreCase("li")) { if (Objects.requireNonNull(parented.parent()).nodeName().equalsIgnoreCase("ul")) { ulFlag = true; } } } } else if (parentNodeName.equalsIgnoreCase("sub")) { subFlag = true; } else if (parentNodeName.equalsIgnoreCase("sup")) { supFlag = true; } else if (parentNodeName.equalsIgnoreCase("li")) { if (Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("ul")) { ulFlag = true; } } else if 
(parentNodeName.equalsIgnoreCase("td")) { tableFlag = true; } } if (node.nodeName().equalsIgnoreCase("br")){ Node preNode = node.previousSibling(); if (null != preNode && null != preNode.parentNode()){ if (preNode.parentNode().nodeName().equalsIgnoreCase("font")) { String color1 = preNode.attr("color"); if (StringUtils.isNotEmpty(color1)){ if (!Objects.equals("#ff0000",color1)){ return; } } } } } if ("#text".equalsIgnoreCase(node.nodeName()) && !tableFlag && !nodeValue.contains("<")) { XWPFRun run = xwpfParagraph.createRun(); run.setFontFamily("Times New Roman"); run.setFontSize(10); if (boldFlag) { run.setBold(true); } if (StringUtils.isNotEmpty(color)) { run.setColor(color); } if (supFlag) { run.setSubscript(VerticalAlign.SUPERSCRIPT); } if (subFlag) { run.setSubscript(VerticalAlign.SUBSCRIPT); } if (ulFlag && StringUtils.isNotEmpty(nodeValue.trim())) { XWPFParagraph paragraph = doc.createParagraph(); paragraph.setIndentFromLeft(0); paragraph.setFirstLineIndent(0); paragraph.setIndentationLeftChars(125); XWPFRun run1 = paragraph.createRun(); run1.setFontFamily("宋体"); run1.setFontSize(8); run1.setText("● "); run1.addTab(); XWPFRun run2 = paragraph.createRun(); run2.setText(nodeValue.trim()); run2.setFontFamily("宋体"); run2.setFontSize(10); } if (StringUtils.isNotEmpty(nodeValue) && !ulFlag){ run.setText(nodeValue.trim()); } } boolean enabledBreak = ReUtil.isMatch("(h[12345]|li|img|br)", node.nodeName().toLowerCase()); if (enabledBreak) { XWPFRun run = xwpfParagraph.createRun(); run.addCarriageReturn(); } } private static void parseTableToWord(NiceXWPFDocument doc, Node node, XWPFParagraph paragraph,String filePathPre) throws Exception { //简化表格html String string = node.toString(); org.jsoup.nodes.Document tableDoc = Jsoup.parse(Objects.requireNonNull(simplifyTable(string))); Elements trList = tableDoc.getElementsByTag("tr"); // 获取页边距 BigInteger right = (BigInteger) doc.getDocument().getBody().getSectPr().getPgMar().getRight(); BigInteger left = (BigInteger) 
doc.getDocument().getBody().getSectPr().getPgMar().getLeft(); // word 工作区域范围宽度 double wordWorkAreaWidth = 21 - ((double) (right.intValue() + left.intValue()) / 567); //创建表格 XWPFTable xwpfTable = doc.insertNewTbl(paragraph.getCTP().newCursor()); if (null == xwpfTable) { return; } //设置样式 xwpfTable.setWidth("100%"); //写入表格行和列内容 for (int row = 0; row < trList.size(); row++) { XWPFTableRow tableRow = xwpfTable.getRow(row); if (null == tableRow){ tableRow = xwpfTable.createRow(); } Element trElement = trList.get(row); Elements tds = trElement.getElementsByTag("td"); double widthTotal = 0.0; for (int col = 0; col < tds.size(); col++) { Element colElement = tds.get(col); List<Node> nodes = colElement.childNodes(); for (Node tdNode : nodes) { if ("img".equalsIgnoreCase(tdNode.nodeName())) { String width = tdNode.attr("width"); if (NumberUtils.isNumeric(width.trim())){ widthTotal = widthTotal + Double.parseDouble(width.trim()); } } } } for (int col = 0; col < tds.size(); col++) { XWPFTableCell tableCell = tableRow.getCell(col); if (null == tableCell){ tableCell = tableRow.createCell(); } CTTcPr tcPr = tableCell.getCTTc().isSetTcPr() ? 
tableCell.getCTTc().getTcPr() : tableCell.getCTTc().addNewTcPr(); CTTcBorders ctTcBorders = tcPr.addNewTcBorders(); ctTcBorders.addNewLeft().setVal(STBorder.NIL); ctTcBorders.addNewRight().setVal(STBorder.NIL); ctTcBorders.addNewTop().setVal(STBorder.NIL); ctTcBorders.addNewBottom().setVal(STBorder.NIL); Element colElement = tds.get(col); List<Node> nodes = colElement.childNodes(); for (Node tdNode : nodes) { if ("img".equalsIgnoreCase(tdNode.nodeName())) { String src = tdNode.attr("src"); String width = tdNode.attr("width"); String height = tdNode.attr("height"); src = src.replaceAll("%20", " ").replaceAll("%26","&"); String picturePath = filePathPre + src; InputStream inputStream = Files.newInputStream(Paths.get(picturePath)); XWPFRun xwpfRun = tableCell.getParagraphs().get(0).createRun(); double picWidth = wordWorkAreaWidth * ( Double.parseDouble(width.trim()) / widthTotal); double picHeight = picWidth * Double.parseDouble(height.trim()) / Double.parseDouble(width.trim()); xwpfRun.addPicture(inputStream, Document.PICTURE_TYPE_PNG, src, (int) (picWidth * Units.EMU_PER_CENTIMETER), (int) (picHeight * Units.EMU_PER_CENTIMETER)); }else if ("#text".equalsIgnoreCase(tdNode.nodeName())){ parseHtmlToWordTable(colElement, doc, tableCell.getParagraphs().get(0)); } } } } } private static void parseHtmlToWordTable(Node node, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph) { List<Node> nodes = node.childNodes(); if (CollectionUtils.isNotEmpty(nodes)) { for (Node childNode : nodes) { parseHtmlToWordTable(childNode, doc, xwpfParagraph); } } //处理table标签 if ("table".equalsIgnoreCase(node.nodeName())) { return; } if (CollectionUtils.isNotEmpty(node.childNodes())) { return; } String nodeValue = node.toString(); Node parent = node.parent(); boolean boldFlag = false; String color = ""; boolean subFlag = false; boolean supFlag = false; boolean ulFlag = false; if (null != parent) { String parentNodeName = parent.nodeName(); if (parentNodeName.equalsIgnoreCase("strong") || 
parentNodeName.equalsIgnoreCase("b")) { boldFlag = true; } else if (parentNodeName.equalsIgnoreCase("font")) { if (Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("strong") || Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("b")) { boldFlag = true; } String color1 = parent.attr("color"); if (StringUtils.isNotEmpty(color1)){ color = color1.substring(1); } Node parented = parent.parent(); if (null != parented){ if (parented.nodeName().equalsIgnoreCase("li")) { if (Objects.requireNonNull(parented.parent()).nodeName().equalsIgnoreCase("ul")) { ulFlag = true; } } } } else if (parentNodeName.equalsIgnoreCase("sub")) { subFlag = true; } else if (parentNodeName.equalsIgnoreCase("sup")) { supFlag = true; } else if (parentNodeName.equalsIgnoreCase("li")) { if (Objects.requireNonNull(parent.parent()).nodeName().equalsIgnoreCase("ul")) { ulFlag = true; } } } if ("#text".equalsIgnoreCase(node.nodeName()) && !nodeValue.contains("<")) { XWPFRun run = xwpfParagraph.createRun(); run.setFontFamily("Times New Roman"); run.setFontSize(10); if (boldFlag) { run.setBold(true); } if (StringUtils.isNotEmpty(color)) { run.setColor(color); } if (supFlag) { run.setSubscript(VerticalAlign.SUPERSCRIPT); } if (subFlag) { run.setSubscript(VerticalAlign.SUBSCRIPT); } if (ulFlag && StringUtils.isNotEmpty(nodeValue.trim())) { XWPFParagraph paragraph = doc.createParagraph(); paragraph.setIndentFromLeft(0); paragraph.setFirstLineIndent(0); paragraph.setIndentationLeftChars(125); XWPFRun run1 = paragraph.createRun(); run1.setFontFamily("Times New Roman"); run1.setFontSize(8); run1.setText("●"); run1.addTab(); XWPFRun run2 = paragraph.createRun(); run2.setText(nodeValue.trim()); run2.setFontFamily("Times New Roman"); run2.setFontSize(10); } if (StringUtils.isNotEmpty(nodeValue) && !ulFlag) { run.setText(nodeValue.trim()); } } boolean enabledBreak = ReUtil.isMatch("(|h[12345]|li|img|br)", node.nodeName().toLowerCase()); if (enabledBreak) { XWPFRun run = 
xwpfParagraph.createRun(); run.addCarriageReturn(); } } public static String simplifyTable(String tableContent) { if (StringUtils.isEmpty(tableContent)) { return null; } org.jsoup.nodes.Document tableDoc = Jsoup.parse(tableContent); Elements trElements = tableDoc.getElementsByTag("tr"); // 针对于colspan操作 for (Element trElement : trElements) { //去除所有样式 trElement.removeAttr("class"); Elements tdElements = trElement.getElementsByTag("td"); List<Element> tdEleList = covertElements2List(tdElements); for (Element curTdElement : tdEleList) { //去除所有样式 curTdElement.removeAttr("class"); Element ele = curTdElement.clone(); String colspanValStr = curTdElement.attr("colspan"); if (!StringUtils.isEmpty(colspanValStr)) { ele.removeAttr("colspan"); int colspanVal = Integer.parseInt(colspanValStr); for (int k = 0; k < colspanVal - 1; k++) { curTdElement.after(ele.outerHtml()); } } } } // 针对于rowspan操作 List<Element> trEleList = covertElements2List(trElements); Element firstTrEle = trElements.first(); if (null == firstTrEle){ return ""; } Elements tdElements = firstTrEle.getElementsByTag("td"); Integer tdCount = tdElements.size(); //获取该列下所有单元格 for (int i = 0; i < tdElements.size(); i++) { for (Element trElement : trEleList) { List<Element> tdElementList = covertElements2List(trElement.getElementsByTag("td")); Node curTdNode = tdElementList.get(i); Node cNode = curTdNode.clone(); String rowspanValStr = curTdNode.attr("rowspan"); if (!StringUtils.isEmpty(rowspanValStr)) { cNode.removeAttr("rowspan"); Element nextTrElement = trElement.nextElementSibling(); int rowspanVal = Integer.parseInt(rowspanValStr); for (int j = 0; j < rowspanVal - 1; j++) { Node tempNode = cNode.clone(); List<Node> nodeList = new ArrayList<Node>(); nodeList.add(tempNode); if (j > 0 && null != nextTrElement) { nextTrElement = nextTrElement.nextElementSibling(); } Integer indexNum = i + 1; if (i == 0) { indexNum = 0; } if (null != nextTrElement){ if (indexNum.equals(tdCount)) { nextTrElement.appendChild(tempNode); } 
else { nextTrElement.insertChildren(indexNum, nodeList); } } } } } } Element tableEle = tableDoc.getElementsByTag("table").first(); if (null == tableEle){ return ""; } return tableEle.outerHtml(); } private static List<Element> covertElements2List(Elements curElements) { return new ArrayList<>(curElements); }}