// Newer
// Older
// OCRTest / src / Main.java
// ubt on 1 Jun 2018 4 KB text + baidu
import com.alibaba.fastjson.JSON;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import utils.CmdRun;
import utils.ImageUtils;
import utils.TextUtils;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Command-line entry point that runs Tesseract OCR over fixed regions of a single image
 * and prints the formatted text of each region as a JSON array of "label= value" strings.
 */
public class Main {

    /** Path of the image to process; it is also passed as both file arguments of the textcleaner command. */
    public static String path = "/home/ubt/ocr/bdc_baidu.jpg";

    /**
     * Loads the image at {@link #path}, rescales it when its width is not 1280, runs the external
     * textcleaner command, then OCRs one header region, nine equally sized table rows and one
     * trailing region. The formatted results are printed as JSON, followed by the elapsed time.
     *
     * <p>The method returns early (after printing a message) when the file does not exist or
     * cannot be decoded as an image. A {@link TesseractException} is caught and its stack trace
     * printed; any other exception raised by the steps below is propagated.
     *
     * @param args ignored
     * @throws Exception any non-Tesseract failure raised while loading, scaling, cleaning or cropping
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        // Collected "label= value" strings, in output order.
        List<String> resultList = new ArrayList<>();

        int areaWidth = 960;    // width of each cropped table row
        int areaHeight = 122;   // height of each cropped table row (also the vertical step between rows)
        int fromX = 236;        // X origin of the table crop
        int fromY = 142;        // Y origin of the table crop

        // Load the image.
        File imageFile = new File(path);
        if (!imageFile.exists()) {
            System.out.println("======== don't find this file! ========");
            return;
        }

        // ImageIO.read returns null (rather than throwing) when no registered reader can decode
        // the file, so guard against it before touching the image.
        BufferedImage image = ImageIO.read(imageFile);
        if (image == null) {
            System.out.println("======== can't decode this file as an image! ========");
            return;
        }

        // Scale the image so the fixed crop coordinates below line up.
        if (image.getWidth() != 1280) {
            imageFile = ImageUtils.zoomImage(imageFile, 1280, 1930);
        }

        // Run the decolorize/cleanup command (requires an ImageMagick environment).
        // NOTE(review): the command reads and writes `path`, while OCR below uses `imageFile`,
        // which may be a different file after zoomImage — confirm zoomImage overwrites in place.
        CmdRun.run("/home/ubt/ocr/textcleaner -g -o 10 -t 20 " + path + " " + path);

        // Configure Tesseract.
        ITesseract iTesseract = new Tesseract();
        iTesseract.setDatapath("./tessdata/");
        iTesseract.setLanguage("eng+chi_sim");

        try {
            // Header region above the table.
            String headerText = iTesseract.doOCR(ImageUtils.cutAreaImage(imageFile, 25, 20, 1188, areaHeight));
            resultList.add("0-不动产权= " + TextUtils.formatHeader(headerText));

            // Nine table rows stacked vertically, each areaHeight tall.
            for (int row = 0; row < 9; row++) {
                int rowY = fromY + (row * areaHeight);
                String rowText = iTesseract.doOCR(ImageUtils.cutAreaImage(imageFile, fromX, rowY, areaWidth, areaHeight));
                resultList.add(formatRow(row, rowText));
            }

            // Final, taller region below the nine rows.
            String othersText = iTesseract.doOCR(ImageUtils.cutAreaImage(imageFile, 236, 1234, areaWidth, 665));
            resultList.add("10-权利其他状况= " + TextUtils.formatOthers(othersText));

            System.out.println(JSON.toJSONString(resultList));

            long end = System.currentTimeMillis();
            System.out.println("用时: " + (end - start));
        } catch (TesseractException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the output entry for one table row by prefixing the row's label to the text
     * produced by the matching {@code TextUtils} formatter.
     *
     * @param rowIndex zero-based row index, 0 through 8
     * @param rawText  OCR output for that row
     * @return the labelled, formatted entry
     * @throws IllegalArgumentException if {@code rowIndex} is outside 0..8
     */
    private static String formatRow(int rowIndex, String rawText) {
        switch (rowIndex) {
            case 0:
                return "1-权利人= " + TextUtils.formatName(rawText);
            case 1:
                return "2-共有情况= " + TextUtils.formatShare(rawText);
            case 2:
                return "3-坐落= " + TextUtils.formatAddress(rawText);
            case 3:
                return "4-不动产单元号= " + TextUtils.formatNum(rawText);
            case 4:
                return "5-权利类型= " + TextUtils.formatType(rawText);
            case 5:
                return "6-权利性质= " + TextUtils.formatHouse(rawText);
            case 6:
                return "7-用途= " + TextUtils.formatUse(rawText);
            case 7:
                return "8-面积= " + TextUtils.formatArea(rawText);
            case 8:
                return "9-使用期限= " + TextUtils.formatLimit(rawText);
            default:
                throw new IllegalArgumentException("unexpected table row index: " + rowIndex);
        }
    }
}