Commit 97a9b122 authored by pengxin

清洗开关调整。

parent 4093e040
......@@ -40,14 +40,22 @@ public class DataCleanerUtil {
/**
 * Runs every enabled cleaning rule over {@code data} and returns the cleaned result.
 *
 * <p>A rule is treated as enabled when {@code rule.getArgs() > 0}. Enabled rules are
 * applied in iteration order, and each rule receives the output of the previous one,
 * so the returned value is the result of the whole chain rather than a concatenation
 * of the intermediate results.
 *
 * @param data  the text to clean
 * @param rules the cleaning rules to consider, applied in iteration order
 * @return the cleaned text; {@link DatasetConstant#EMPTY_STR} when
 *         {@code StringUtils.isEmpty(data)} is true or when no rule is enabled
 */
public static String buildCleanAfterData(String data, List<DatasetRule> rules) {
    if (StringUtils.isEmpty(data)) {
        return DatasetConstant.EMPTY_STR;
    }
    // Number of rules that were actually applied to the data.
    int count = 0;
    for (DatasetRule rule : rules) {
        if (rule.getArgs() > 0) {
            // Feed the previous rule's output into the next rule.
            data = buildJsonData(rule.getName(), data, rule.getArgs());
            count++;
        }
    }
    // No cleaning rule was enabled, so there is no cleaned output to report.
    if (count == 0) {
        return DatasetConstant.EMPTY_STR;
    }
    return data;
}
/**
 * Manual smoke check: removes every substring matching the regex {@code <[^>]*>}
 * (tag-like markup) from a fixed sample sentence and prints the result to stdout.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final String sample = "我们是中国人,我们是地球人,<AUTHOR>我们要团结起来。团结就是力量,力量就是一切。</AUTHOR>";
    final String stripped = sample.replaceAll("<[^>]*>", "");
    System.out.println(stripped);
}
/**
......@@ -105,12 +113,6 @@ public class DataCleanerUtil {
return data;
}
/**
 * Manual smoke check: passes a fixed sample sentence to {@code filterWordRepetition}
 * with {@code 0.3} as the second argument and prints the returned text to stdout.
 * NOTE(review): 0.3 is presumably a repetition threshold; confirm against
 * {@code filterWordRepetition}.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final String sample = "我们是中国人,我们是地球人,我们要团结起来。团结就是力量,力量就是一切。";
    final String filtered = filterWordRepetition(sample, 0.3);
    System.out.println(filtered);
}
/**
* 计算阀值
* @param data 计算坏的数据
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment