2016-04-21

Html不失真转换成Word文件

阅读次数:次

内容不失真转换成word文件，需要先将内容转换成html格式，然后利用POI对html直接转换成word文件，这里不失真的意思是指：对于html中的字体，字号，样式和图片不失真地转换到word中。

将内容导出html

这一部分没有难点，本文主要讲如何把导出的html转成word。在导出html的时候需要注意两点：一是中文编码问题，
二是要对字体、字号等进行声明：
WORD中对字体的大小同时采用了两种不同的度量单位，其一是我们中国人所熟悉的“字号”，另外一种则是以“磅”为度量单位。
这两种度量字大小的单位之间是什么样的关系呢？下面就是二者的对应关系：

磅	字号
42	初号
36	小初
26	一号
24	小一号
22	二号
18	小二号
16	三号
15	小三号
14	四号
12	小四号
10.5	五号
9	小五号
7.5	六号
6.5	小六号
5.5	七号
5	八号

在html中pt 代表磅的单位。

另外一点需要注意：如果我的html中包含图片，且图片地址是网站外链，则不需要做处理；如果是带有防盗链的图片地址，
还需要将图片的防盗链有效化，也就是把图片地址中的防盗链信息更换成有效的。比如我的防盗链中包含了sessionid，
由于其有效期只在会话期间，所以这里需要把sessionid换成当前用户的有效sessionid。

上代码：

package com.wwl.util.html2word;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Iterator;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Exports a piece of HTML content to a standalone UTF-8 HTML file so that it can later be
 * converted to a Word document without losing fonts, sizes, styles or images.
 *
 * <p>Images protected by hotlink protection must be given a currently valid address before
 * export, otherwise the generated Word document will not show them.
 *
 * @author 王文路
 * @date 2015-7-22
 */
public class Content2HtmlHelper {

    /**
     * Number of leading characters removed from every image {@code src} before
     * {@code basePath} is prepended.
     * NOTE(review): presumably the length of the stale hotlink-protection prefix of the
     * image address — confirm against the real image URLs used by the system.
     */
    private static final int SRC_PREFIX_LENGTH = 36;

    /** The HTML content to export. */
    private final String content;

    /** Path of the HTML file to generate; expected to end with {@code .html}. */
    private final String filePath;

    /**
     * Prefix used to re-validate hotlink-protected images, e.g.
     * {@code http://localhost/test/api/{sessionid}}. The session id differs on every login,
     * so images of this system embedded in the content must be re-pointed at the current one.
     * May be {@code null}, in which case image addresses are left untouched.
     */
    private final String basePath;

    /**
     * @param content  HTML content to export
     * @param filePath destination path of the generated HTML file
     * @param basePath prefix prepended to rewritten image addresses, or {@code null} to keep
     *                 the image addresses as they are
     */
    public Content2HtmlHelper(String content, String filePath, String basePath) {
        this.content = content;
        this.filePath = filePath;
        this.basePath = basePath;
    }

    /**
     * Builds the complete HTML document and writes it to {@code filePath} encoded as UTF-8.
     *
     * @return {@code true} if the file was completely written and closed, {@code false} if an
     *         I/O error occurred
     */
    public boolean import2Html() {

        StringBuilder html = new StringBuilder();

        // 1. HTML header; the declared charset matches the encoding used when writing the file.
        html.append("<!DOCTYPE html><html><head>")
            .append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">")
            .append("</head><body>");

        // 2. Fixed heading; 16pt corresponds to Word's font size "三号".
        html.append("<p style='font-size:16.0pt;font-family:黑体'>测试文件</p>");

        // 3. Re-point hotlink-protected images at the currently valid base path.
        org.jsoup.nodes.Document doc = Jsoup.parse(this.content);
        rewriteImageSources(doc);

        // Jsoup wraps the parsed content in html/head/body elements; only the body's inner
        // HTML belongs inside our own document skeleton.
        Element body = doc.body();
        if (body != null) {
            html.append(body.html());
        }

        // 4. Close the document.
        html.append("</body></html>");

        // 5. Write the HTML file.
        return writeHtml(html.toString());
    }

    /**
     * Replaces the first {@link #SRC_PREFIX_LENGTH} characters of every {@code img} address
     * with {@code basePath}.
     *
     * <p>Nothing is rewritten when {@code basePath} is {@code null} (otherwise the literal text
     * "null" would end up in the address), and addresses shorter than the prefix are left
     * as they are instead of failing with a {@link StringIndexOutOfBoundsException}.
     */
    private void rewriteImageSources(org.jsoup.nodes.Document doc) {
        if (this.basePath == null) {
            return;
        }
        for (Element img : doc.getElementsByTag("img")) {
            String src = img.attr("src");
            if (src.length() >= SRC_PREFIX_LENGTH) {
                img.attr("src", this.basePath + src.substring(SRC_PREFIX_LENGTH));
            }
        }
    }

    /**
     * Writes {@code html} to {@code filePath} using UTF-8.
     *
     * @return {@code true} only if the data was written, flushed and the file closed
     */
    private boolean writeHtml(String html) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            fos = new FileOutputStream(new File(this.filePath));
            bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"));
            bw.write(html);
            // Close inside the try block: closing flushes the buffer, and a failure to flush
            // means the file is incomplete and must not be reported as success.
            bw.close();

            System.out.println("将信息点转换html文件完成");
            return true;
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is a subclass of IOException.
            e.printStackTrace();
            return false;
        } finally {
            // Closing an already closed stream has no effect, so this is safe on every path.
            closeQuietly(bw);
            closeQuietly(fos);
        }
    }

    /** Closes {@code resource} if it is not {@code null}, ignoring any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup only; the outcome has already been decided by the caller.
        }
    }

    /**
     * Ad-hoc manual test.
     *
     * @author 王文路
     * @date 2015-7-23
     * @param args unused
     */
    public static void main(String[] args) {

        new Content2HtmlHelper("测试" , "G:\\123.html" , null).import2Html();
    }
}

利用POI框架将html转成word

poi提供了将内容转成word的语法：

// Create a new POIFS file system object and take its root directory.
POIFSFileSystem poifs = new POIFSFileSystem();
    DirectoryEntry directory = poifs.getRoot();
    // Store the data read from the input stream `is` as a root entry named "WordDocument".
    directory.createDocument(
            "WordDocument", is);

    // Write the whole file system to the file at this.outputPath.
    fos = new FileOutputStream(this.outputPath);
    poifs.writeFilesystem(fos);

那么这里内容既然可以直接转成word，为什么还要先转成html呢？
这里是避免转成word的时候失真，而且在控制字体大小颜色，表格样式，图片大小和位置，直接通过POI进行更改格式比较麻烦。
这里就是先把内容，布局，样式等做好，转成html，然后利用POI将html转成word的时候是转成web大纲样式的，所以失真较少。

上代码：

package com.wwl.util.html2word;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Converts an HTML file into a Word ({@code .doc}) file by storing the HTML bytes as the
 * {@code WordDocument} entry of a POIFS file system.
 *
 * @author 王文路
 * @date 2015-7-23
 */
public class Html2DocConverter {

    /** Path of the input file; expected to end with {@code .html}. */
    private final String inputPath;

    /** Path of the output file; expected to end with {@code .doc}. */
    private final String outputPath;

    /**
     * @param inputPath  path of the HTML file to read
     * @param outputPath path of the Word file to create
     */
    public Html2DocConverter(String inputPath, String outputPath) {
        this.inputPath = inputPath;
        this.outputPath = outputPath;
    }

    /**
     * Reads the HTML file at {@code inputPath} and writes it as a Word file to
     * {@code outputPath}, creating missing parent directories of the output file.
     *
     * @return {@code true} if the Word file was written; {@code false} if the input file does
     *         not exist or an {@link IOException} occurred while reading or writing
     * @throws Exception if closing one of the streams fails
     */
    public boolean writeWordFile() throws Exception {

        // 1. Without a source file there is nothing to convert.
        File inputFile = new File(this.inputPath);
        if (!inputFile.exists()) {
            return false;
        }

        // 2. Create the target directory if needed. getParentFile() is null when the output
        //    path has no directory part (e.g. "out.doc"); nothing has to be created then.
        File outputFile = new File(this.outputPath);
        File parentDir = outputFile.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            // A failure here surfaces below when the output stream cannot be opened.
            parentDir.mkdirs();
        }

        InputStream is = null;
        FileOutputStream fos = null;
        try {

            // 3. Store the HTML bytes as the "WordDocument" entry and write the file system out.
            is = new FileInputStream(inputFile);
            POIFSFileSystem poifs = new POIFSFileSystem();
            DirectoryEntry directory = poifs.getRoot();
            directory.createDocument("WordDocument", is);

            fos = new FileOutputStream(outputFile);
            poifs.writeFilesystem(fos);

            System.out.println("转换word文件完成!");

            return true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Nested so that the input stream is closed even if closing the output fails.
            try {
                if (fos != null) {
                    fos.close();
                }
            } finally {
                if (is != null) {
                    is.close();
                }
            }
        }

        return false;
    }

    /**
     * Ad-hoc manual run converting a fixed HTML file into a fixed Word file.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {

        new Html2DocConverter("G:/123.html" , "G:/temp5.doc").writeWordFile();
    }
}

源码下载地址