// Newer / Older
// OCRTest / src / Main.java
// ubt on 30 May 2018, 3 KB, init
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Command-line entry point that runs Tesseract OCR over fixed rectangular
 * regions of a scanned form image and prints the recognised text per field.
 *
 * <p>All crop coordinates are hard-coded pixel values; the image is scaled to
 * {@link #TARGET_WIDTH} x {@link #TARGET_HEIGHT} first when its width differs,
 * so that the same coordinates are applied to every input.
 */
public class Main {

    /** Image location read by {@link #main(String[])}. */
    private static final String IMAGE_PATH = "/home/ubt/ocr/bdc2.jpg";

    /** Width (pixels) the image is scaled to before OCR when its own width differs. */
    private static final int TARGET_WIDTH = 2590;
    /** Height (pixels) passed to the scaler together with {@link #TARGET_WIDTH}. */
    private static final int TARGET_HEIGHT = 3903;

    /** Width of the region cropped for each table row. */
    private static final int AREA_WIDTH = 1978;
    /** Height of the region cropped for each table row (also used for the header strip). */
    private static final int AREA_HEIGHT = 245;
    /** X origin of the table rows. */
    private static final int FROM_X = 476;
    /** Y origin of the first table row; row i starts at FROM_Y + i * AREA_HEIGHT. */
    private static final int FROM_Y = 285;

    /** Header strip: x, y and width (its height is {@link #AREA_HEIGHT}). */
    private static final int HEADER_X = 50;
    private static final int HEADER_Y = 40;
    private static final int HEADER_WIDTH = 2425;

    /** Final, taller region below the rows: y origin and height (x and width match the rows). */
    private static final int OTHER_Y = 2490;
    private static final int OTHER_HEIGHT = 1300;

    /** Result keys for the table rows, in top-to-bottom order; index == row number. */
    private static final String[] ROW_KEYS = {
        "1-权利人",
        "2-共有情况",
        "3-坐落",
        "4-不动产单元号",
        "5-权利类型",
        "6-权利性质",
        "7-用途",
        "8-面积",
        "9-使用期限"
    };

    /**
     * Loads the image, normalises its size, OCRs the header, each table row and
     * the trailing block, then prints the collected results and the elapsed
     * time in milliseconds.
     *
     * <p>Returns early (after printing a message) when the file is missing or
     * cannot be decoded as an image. A {@link TesseractException} raised during
     * recognition is reported via its stack trace and ends the run without
     * printing results.
     *
     * @param args unused
     * @throws Exception if reading or scaling the image fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();

        // Insertion-ordered so the printed map follows the numbered keys (0 .. 10).
        Map<String, String> resultMap = new LinkedHashMap<>();

        File imageFile = new File(IMAGE_PATH);
        if (!imageFile.exists()) {
            System.out.println("======== don't find this file! ========");
            return;
        }

        BufferedImage image = ImageIO.read(imageFile);
        if (image == null) {
            // ImageIO.read yields null when no registered reader can decode the file.
            System.out.println("======== can't read this file as an image! ========");
            return;
        }
        if (image.getWidth() != TARGET_WIDTH) {
            imageFile = ImageUtils.zoomImage(imageFile, TARGET_WIDTH, TARGET_HEIGHT);
        }

        ITesseract iTesseract = new Tesseract();
        iTesseract.setDatapath("./tessdata/");
        iTesseract.setLanguage("eng+chi_sim");

        try {
            // Header strip across the top of the page.
            String header = iTesseract.doOCR(
                    imageFile, new Rectangle(HEADER_X, HEADER_Y, HEADER_WIDTH, AREA_HEIGHT));
            resultMap.put("0-不动产权", TextUtils.formatHeader(header));
            System.out.println(TextUtils.clearFormat(header));

            // Equal-height table rows stacked vertically from FROM_Y.
            for (int i = 0; i < ROW_KEYS.length; i++) {
                Rectangle rowArea =
                        new Rectangle(FROM_X, FROM_Y + (i * AREA_HEIGHT), AREA_WIDTH, AREA_HEIGHT);
                String rowText = iTesseract.doOCR(imageFile, rowArea);
                resultMap.put(ROW_KEYS[i], formatRow(i, rowText));
            }

            // Trailing block below the rows.
            String other = iTesseract.doOCR(
                    imageFile, new Rectangle(FROM_X, OTHER_Y, AREA_WIDTH, OTHER_HEIGHT));
            resultMap.put("10-权利其他状况", TextUtils.clearFormat(other));

            System.out.println(resultMap.toString());

            long end = System.currentTimeMillis();
            System.out.println("用时: " + (end - start));
        } catch (TesseractException e) {
            e.printStackTrace();
        }
    }

    /**
     * Applies the row-specific text clean-up: row 1 uses
     * {@code TextUtils.formatShare}, row 2 uses {@code TextUtils.formatAddress},
     * every other row uses {@code TextUtils.clearFormat}.
     *
     * @param row  zero-based row index within the table
     * @param text raw OCR output for that row
     * @return the cleaned-up text to store for the row
     */
    private static String formatRow(int row, String text) {
        switch (row) {
            case 1:
                return TextUtils.formatShare(text);
            case 2:
                return TextUtils.formatAddress(text);
            default:
                return TextUtils.clearFormat(text);
        }
    }

}