Commit 63816961 authored by pengxin

调整清洗数据

parent 44b91781
...@@ -37,10 +37,7 @@ import org.springframework.transaction.annotation.Transactional; ...@@ -37,10 +37,7 @@ import org.springframework.transaction.annotation.Transactional;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.stream.Collectors; import java.util.stream.Collectors;
...@@ -251,9 +248,16 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp ...@@ -251,9 +248,16 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
List<DatasetRule> rules = new ArrayList<>(); List<DatasetRule> rules = new ArrayList<>();
if(null != datasetCleanConfig) { if(null != datasetCleanConfig) {
String[] jsonStrings = {datasetCleanConfig.getFilterConfig(),datasetCleanConfig.getDesensitiveConfig(), String[] nonEmptyJsonStrings = {datasetCleanConfig.getFilterConfig(),datasetCleanConfig.getDesensitiveConfig(),
datasetCleanConfig.getDesensitiveConfig(),datasetCleanConfig.getDeduplicateConfig(), datasetCleanConfig.getDesensitiveConfig(),datasetCleanConfig.getDeduplicateConfig(),
datasetCleanConfig.getCleanConfig()}; datasetCleanConfig.getCleanConfig()};
String[] jsonStrings = Arrays.stream(nonEmptyJsonStrings)
.map(Optional::ofNullable)
.filter(Optional::isPresent)
.map(Optional::get)
.toArray(String[]::new);
ObjectMapper objectMapper = new ObjectMapper(); ObjectMapper objectMapper = new ObjectMapper();
rules = Arrays.stream(jsonStrings) rules = Arrays.stream(jsonStrings)
.map(jsonString -> { .map(jsonString -> {
......
...@@ -180,7 +180,7 @@ public class DataCleanerUtil { ...@@ -180,7 +180,7 @@ public class DataCleanerUtil {
// 计算词重复率 // 计算词重复率
double repetitionRate = (double) repeatedWordsCount / totalWords; double repetitionRate = (double) repeatedWordsCount / totalWords;
return repetitionRate < threshold ? document : DatasetConstant.EMPTY_STR; return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : document;
} }
/** /**
...@@ -215,7 +215,8 @@ public class DataCleanerUtil { ...@@ -215,7 +215,8 @@ public class DataCleanerUtil {
// 计算字重复率 // 计算字重复率
double repetitionRate = (double) repeatedCharactersCount / totalCharacters; double repetitionRate = (double) repeatedCharactersCount / totalCharacters;
return repetitionRate < threshold ? text : DatasetConstant.EMPTY_STR; //根据阀值判断进行数据返回
return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : text;
} }
/** /**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment