Newer
Older
OCRTest / src / TextUtils.java
ubt on 30 May 2018 6 KB init
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that clean up and normalise text fragments (addresses, share
 * information, certificate headers). The substitutions performed here, such as
 * mapping the letter {@code O} to the digit {@code 0}, suggest the input comes
 * from an OCR engine.
 *
 * <p>All regular expressions are compiled once and shared; {@link Pattern}
 * instances are immutable and safe to reuse.
 */
public class TextUtils {

    /** Inputs longer than this are discarded by {@link #clearFormat(String)}. */
    private static final int MAX_CLEAR_LENGTH = 120;

    /** Characters stripped by {@link #clearFormat(String)}: space, pipe, curly quotes, newline. */
    private static final Pattern CLEAR_NOISE = Pattern.compile("[ |”“\\n]");

    /** Two or more ASCII letters, digits or CJK ideographs (U+4E00..U+9FA5). */
    private static final Pattern SAFE_TEXT = Pattern.compile("[a-zA-Z0-9\\u4E00-\\u9FA5]{2,}");

    /**
     * Runs of two or more characters that are not a digit, a CJK ideograph or a
     * parenthesis, or any single whitespace character. The parentheses inside the
     * brackets are literal members of the character class, not a group.
     */
    private static final Pattern ADDRESS_NOISE =
            Pattern.compile("([^(0-9\\u4E00-\\u9FA5)]{2,})|(\\s)");

    /** A single CJK ideograph in the range U+4E00..U+9FA5. */
    private static final Pattern CHINESE = Pattern.compile("[\\u4E00-\\u9FA5]");

    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** Anything that is neither a word character ({@code [a-zA-Z_0-9]}) nor a CJK ideograph. */
    private static final Pattern SHARE_FORBIDDEN = Pattern.compile("[^\\w\\u4E00-\\u9FA5]");

    /**
     * Everything except parentheses, digits and CJK ideographs. Parentheses are kept
     * on purpose: the year extraction in {@link #formatHeader(String)} looks for them.
     */
    private static final Pattern HEADER_NOISE = Pattern.compile("[^()0-9\\u4E00-\\u9FA5]");

    /** One-character province / region abbreviations. */
    private static final Pattern PROVINCE = Pattern.compile(
            "[京津冀晋蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川蜀黔贵滇云藏陕秦甘陇青宁新港澳台]");

    private static final Pattern YEAR_BOTH_PARENS = Pattern.compile("\\([0-9]+\\)");
    private static final Pattern YEAR_OPEN_PAREN = Pattern.compile("\\([0-9]+");
    private static final Pattern YEAR_CLOSE_PAREN = Pattern.compile("[0-9]+\\)");

    /** Everything up to and including the last character of the title marker set. */
    private static final Pattern UP_TO_TITLE = Pattern.compile(".*[不动产权第]");

    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Shortest run of CJK ideographs / parentheses that is followed by a title marker character. */
    private static final Pattern CATEGORY = Pattern.compile("[()\\u4E00-\\u9FA5]+?[不动产权第]");

    /** A title marker character immediately followed by the certificate digits (group 1). */
    private static final Pattern NUMBER = Pattern.compile("[不动产权第]([0-9]+)");

    /** Fixed title text appended after the extracted category. */
    private static final String TITLE = "不动产权第";

    /** Suffix appended after the extracted certificate digits. */
    private static final String NUMBER_SUFFIX = "号";

    /**
     * Strips spaces, pipe characters, curly double quotes and newlines from a string.
     *
     * @param str the text to clean; may be {@code null}
     * @return the cleaned text, or an empty string when {@code str} is {@code null}
     *         or its original length (before cleaning) exceeds 120 characters
     */
    public static String clearFormat(String str) {
        if (str == null || str.length() > MAX_CLEAR_LENGTH) {
            return "";
        }
        return CLEAR_NOISE.matcher(str).replaceAll("");
    }

    /**
     * Checks whether the trimmed input consists of anything other than a run of at
     * least two ASCII letters, digits or CJK ideographs.
     *
     * <p>Note that, because at least two characters are required, an empty string or
     * a single otherwise-valid character is reported as risky.
     *
     * @param qString the text to check; may be {@code null}
     * @return {@code false} when {@code qString} is {@code null} or, after trimming,
     *         is made up solely of two or more letters/digits/CJK ideographs;
     *         {@code true} otherwise
     */
    public static boolean hasCrossScriptRisk(String qString) {
        if (qString == null) {
            return false;
        }
        return !SAFE_TEXT.matcher(qString.trim()).matches();
    }

    /**
     * Cleans an address: removes every whitespace character and every run of two or
     * more characters that are not digits, CJK ideographs or parentheses. Isolated
     * single characters (for example one Latin letter) are kept.
     *
     * @param address the raw address; may be {@code null}
     * @return the cleaned address, or an empty string when the input is {@code null},
     *         empty, or contains no CJK ideograph after cleaning
     */
    public static String formatAddress(String address) {
        if (address == null || address.isEmpty()) {
            return "";
        }
        String cleaned = ADDRESS_NOISE.matcher(address).replaceAll("");
        // An address without any Chinese character is considered unusable.
        return CHINESE.matcher(cleaned).find() ? cleaned : "";
    }

    /**
     * Removes whitespace from the share information and blanks it entirely when any
     * punctuation or symbol remains. Only word characters ({@code [a-zA-Z_0-9]}) and
     * CJK ideographs are accepted; parentheses count as punctuation.
     *
     * @param shareInfo the raw share information; may be {@code null}
     * @return the whitespace-free text, or an empty string when the input is
     *         {@code null}, empty, or contains any other character
     */
    public static String formatShare(String shareInfo) {
        if (shareInfo == null || shareInfo.isEmpty()) {
            return "";
        }
        String compact = WHITESPACE.matcher(shareInfo).replaceAll("");
        return SHARE_FORBIDDEN.matcher(compact).find() ? "" : compact;
    }

    /**
     * Rebuilds a real-estate certificate number from a header line in the form
     * {@code <province>(<year>)<category>不动产权第<digits>号}. Each part that cannot be
     * found is simply omitted from the result.
     *
     * <p>Before parsing, look-alike letters are mapped to digits ({@code O/o -> 0},
     * {@code l -> 1}, {@code s/S -> 3}), the characters 一 and 二 are dropped, and
     * everything except parentheses, digits and CJK ideographs is removed.
     *
     * @param header the raw header text; may be {@code null}
     * @return the reconstructed certificate number, or an empty string when the
     *         input is {@code null} or empty
     */
    public static String formatHeader(String header) {
        if (header == null || header.isEmpty()) {
            return "";
        }
        String normalised = header
                .replace('O', '0').replace('o', '0')
                .replace('l', '1')
                .replace('s', '3').replace('S', '3')
                .replace("一", "").replace("二", "");
        String cleaned = HEADER_NOISE.matcher(normalised).replaceAll("");

        return extractProvince(cleaned)
                + extractYear(cleaned)
                + extractCategory(cleaned)
                + extractNumber(cleaned);
    }

    /**
     * Finds the first province abbreviation, searching only the text up to the last
     * opening parenthesis (or, failing that, the last closing one) when present.
     */
    private static String extractProvince(String cleaned) {
        int cut = cleaned.lastIndexOf('(');
        if (cut < 0) {
            cut = cleaned.lastIndexOf(')');
        }
        String scope = cut < 0 ? cleaned : cleaned.substring(0, cut + 1);
        Matcher matcher = PROVINCE.matcher(scope);
        return matcher.find() ? matcher.group() : "";
    }

    /**
     * Extracts the year as {@code (digits)}, tolerating a missing parenthesis on
     * either side; as a last resort uses the first digit run before the title text.
     */
    private static String extractYear(String cleaned) {
        Matcher matcher = YEAR_BOTH_PARENS.matcher(cleaned);
        if (matcher.find()) {
            return matcher.group();
        }
        matcher = YEAR_OPEN_PAREN.matcher(cleaned);
        if (matcher.find()) {
            return matcher.group() + ")";
        }
        matcher = YEAR_CLOSE_PAREN.matcher(cleaned);
        if (matcher.find()) {
            return "(" + matcher.group();
        }
        // No parenthesis at all: look for digits in the part that precedes the title.
        matcher = UP_TO_TITLE.matcher(cleaned);
        if (matcher.find()) {
            Matcher digits = DIGITS.matcher(matcher.group());
            if (digits.find()) {
                return "(" + digits.group() + ")";
            }
        }
        return "";
    }

    /** Extracts the text preceding the title marker and appends the fixed title. */
    private static String extractCategory(String cleaned) {
        Matcher matcher = CATEGORY.matcher(cleaned);
        if (!matcher.find()) {
            return "";
        }
        String matched = matcher.group();
        // Drop the trailing marker character, then any parentheses picked up by the match.
        String name = matched.substring(0, matched.length() - 1)
                .replace("(", "")
                .replace(")", "");
        return name + TITLE;
    }

    /** Extracts the digit run that directly follows a title marker character. */
    private static String extractNumber(String cleaned) {
        Matcher matcher = NUMBER.matcher(cleaned);
        return matcher.find() ? matcher.group(1) + NUMBER_SUFFIX : "";
    }

    /** Small manual demo: prints a sample header and its formatted form. */
    public static void main(String[] args) {
        String str = "一伟(iegyPRieee4";
        System.out.println(str);
        System.out.println(formatHeader(str));
    }
}