Commit 7f447ff0 authored by pengxin

Dataset cleaning: add a rule switch (on/off) module.

parent cc809ff6
@@ -70,6 +70,11 @@
         <artifactId>opencc4j</artifactId>
         <version>1.6.2</version>
     </dependency>
+    <dependency>
+        <groupId>com.hankcs</groupId>
+        <artifactId>hanlp</artifactId>
+        <version>portable-1.8.2</version>
+    </dependency>
 </dependencies>
 <build>
...
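The hanlp artifact added above bundles a portable segmentation model, which the new cleaning filters below build on. A minimal sketch of the call they rely on (a hedged illustration; the exact tokens depend on the bundled dictionary):

    import com.hankcs.hanlp.HanLP;
    import com.hankcs.hanlp.seg.common.Term;
    import java.util.List;

    List<Term> terms = HanLP.segment("我们是中国人");
    // term.word holds the token text, roughly [我们, 是, 中国人] here
    terms.forEach(t -> System.out.println(t.word));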
@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
      */
     public static final String REPLACE_IDENTIFIER = "replace_identifier";
+    /**
+     * Check the word count of a document
+     */
+    public static final String FILTER_CHECK_NUMBER_WORDS = "filter_check_number_words";
+    /**
+     * Check the character repetition rate of a document
+     */
+    public static final String FILTER_CHECK_WORD_REPETITION_REMOVAL = "filter_check_word_repetition_removal";
+    /**
+     * Check the word repetition rate of a document
+     */
+    public static final String FILTER_CHECK_CHARACTER_REPETITION_REMOVAL = "filter_check_character_repetition_removal";
+    /**
+     * Check the special character rate of a document
+     */
+    public static final String FILTER_CHECK_SPECIAL_CHARACTERS = "filter_check_special_characters";
+    /**
+     * Check the flagged (pornographic/violent) word rate of a document
+     */
+    public static final String FILTER_CHECK_FLAGGED_WORDS = "filter_check_flagged_words";
 }
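For orientation, these constant values are assumed to appear as the "name" field of rule entries in the stored cleaning config JSON, each paired with an "args" threshold (the sample values below are illustrative, not taken from the commit):

    // Assumed shape of one config column, e.g. filter_config:
    // [{"name": "filter_check_number_words",       "args": 500},
    //  {"name": "filter_check_special_characters", "args": 0.25},
    //  {"name": "filter_check_flagged_words",      "args": 0.1}]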
@@ -7,6 +7,11 @@ public class DatasetConstant {
      */
     public static final Integer STATUS_UNPUBLISHED = 0;
+    /**
+     * Imported
+     */
+    public static final Integer INPUT_STATUS = 1;
     /**
      * Published status
      */
@@ -62,6 +67,21 @@ public class DatasetConstant {
      */
     public static final String OUTPUT = "output";
+    /**
+     * args parameter key
+     */
+    public static final String ARGS = "args";
+    /**
+     * name parameter key
+     */
+    public static final String NAME = "name";
+    /**
+     * Switch "off" state
+     */
+    public static final String CLOSED = "0";
     /**
      * data field
      */
@@ -82,6 +102,16 @@ public class DatasetConstant {
      */
     public static final Integer CLEAN_FINISHED = 1;
+    /**
+     * Cleaning paused
+     */
+    public static final Integer PAUSE_FINISHED = 2;
+    /**
+     * Empty string
+     */
+    public static final String EMPTY_STR = "";
     /**
      * Text data cleaning
      */
...
@@ -362,7 +362,7 @@ public class DatasetVersionController {
     // Then store the dataset configuration file
     datasetVersionService.saveDatasetInfo(versionName);
     datasetVersion.setFileUrl(fullName);
-    datasetVersion.setInputStatus(1);
+    datasetVersion.setInputStatus(DatasetConstant.INPUT_STATUS);
     datasetVersion.setDataVolume(Long.valueOf(JSON.parseArray(new String(importFile.getBytes(), StandardCharsets.UTF_8)).size()));
     this.datasetVersionService.updateById(datasetVersion);
     return ResponseResult.success();
...
@@ -27,11 +27,17 @@ public class DatasetDataFilter {
     @Field("clean_id")
     private Long cleanId;
+    /**
+     * Data before cleaning
+     */
+    @Field("clean_before_data")
+    private String cleanBeforeData;
     /**
      * Data after cleaning
      */
-    @Field("content")
-    private String content;
+    @Field("clean_after_data")
+    private String cleanAfterData;
     /**
      * Creation time
...
+package com.yice.webadmin.app.data;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+public class DatasetRule {
+    /**
+     * Rule name
+     */
+    private String name;
+    /**
+     * Rule threshold argument (a rule is enabled when args > 0)
+     */
+    private double args;
+
+    @Override
+    public String toString() {
+        return "DatasetRule{" +
+                "args=" + args +
+                ", name='" + name + '\'' +
+                '}';
+    }
+}
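Since the class carries Lombok's @Data and @NoArgsConstructor, Jackson can bind rule JSON straight onto it. A minimal sketch (the sample payload is an assumption, not from the commit; readValue throws the checked JsonProcessingException):

    ObjectMapper mapper = new ObjectMapper();
    // Each stored config column is assumed to hold a JSON array of {name, args} objects
    DatasetRule[] rules = mapper.readValue(
            "[{\"name\":\"filter_check_flagged_words\",\"args\":0.1}]",
            DatasetRule[].class);
    // rules[0].getName() -> "filter_check_flagged_words"; rules[0].getArgs() -> 0.1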
@@ -64,7 +64,7 @@ public class DatasetCleanDto {
     /**
      * Cleaning status.
      */
-    @ApiModelProperty(value = "Cleaning status: 0: in progress; 1: finished")
+    @ApiModelProperty(value = "Cleaning status; 0: not cleaned; 1: cleaned; 2: paused")
     private Integer cleanStatus;
     /**
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
     private Long cleanId;
     /**
-     * Filtered content
+     * Data before cleaning
      */
-    @ApiModelProperty(value = "Filtered content")
-    private String content;
+    @ApiModelProperty(value = "Data before cleaning")
+    private String cleanBeforeData;
+    /**
+     * Data after cleaning.
+     */
+    @ApiModelProperty(value = "Data after cleaning")
+    private String cleanAfterData;
     /**
      * Creation time.
...
@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
     List<Document> documents = new ArrayList<>();
     if (CollUtil.isNotEmpty(filters)) {
         for (DatasetDataFilter filter : filters) {
-            Document document = new Document(MongoConstant.CONTENT, filter.getContent())
+            Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, filter.getCleanBeforeData())
+                    .append(MongoConstant.CLEAN_AFTER_DATA, filter.getCleanAfterData())
                     .append(MongoConstant.CLEAN_ID, filter.getCleanId())
                     .append(MongoConstant.CREATE_TIME, new Date());
             documents.add(document);
...
@@ -27,6 +27,7 @@ import com.yice.webadmin.app.service.DatasetVersionService;
 import com.yice.webadmin.app.util.DataCleanerUtil;
 import com.yice.webadmin.app.util.JsonNameExtractor;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.scheduling.annotation.AsyncResult;
@@ -37,6 +38,7 @@ import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 import java.util.concurrent.ConcurrentHashMap;
@@ -176,6 +178,13 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
     Future<?> future = futures.remove(cleanId);
     if (future != null && !future.isDone()) {
         future.cancel(true);
+        // Pause the cleaning task
+        DatasetClean filter = new DatasetClean();
+        filter.setCleanStatus(DatasetConstant.PAUSE_FINISHED);
+        filter.setFinishTime(null);
+        filter.setCleanId(cleanId);
+        this.updateById(filter);
     }
 }
@@ -204,26 +213,13 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
      */
     private void dealWithTaskHandler(Long datasetId, Long cleanId) {
         try {
-            List<String> rules = new ArrayList<>();
-            DatasetCleanConfig filter = new DatasetCleanConfig();
-            filter.setCleanId(cleanId);
-            DatasetCleanConfig config = datasetCleanConfigService.getOne(filter);
-            if (null != config) {
-                rules.add(config.getFilterConfig());
-                rules.add(config.getDesensitiveConfig());
-                rules.add(config.getDeduplicateConfig());
-                rules.add(config.getCleanConfig());
-                rules = rules.stream()
-                        .filter(rule -> rule != null && !rule.isEmpty())
-                        .collect(Collectors.toList());
-                rules = JsonNameExtractor.extractNames(rules);
-            }
             DatasetVersion datasetVersion = this.datasetVersionService.getById(datasetId);
             datasetVersionService.saveDatasetInfo(datasetVersion.getVersionName());
             clearFileDatasetData(datasetVersion.getFileUrl());
             Long count = datasetDataService.count(datasetId);
+            if (count > 0) {
+                List<DatasetRule> rules = buildRulesList(cleanId);
                 int pageSize = DatasetConstant.MAX_SIZE;
                 int totalPages = (int) Math.ceil((double) count / pageSize);
                 MyPageParam param;
@@ -232,14 +228,48 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
                 param.setPageNum(i);
                 param.setPageSize(pageSize);
                 List<DatasetData> dataList = datasetDataService.list(datasetId, param);
-                dealWithDatasetNodeData(dataList, datasetId, rules);
-                appendDataListToFile(datasetVersion.getFileUrl(), dataList);
+                List<DatasetData> newDataList = dealWithDatasetNodeData(dataList, datasetId, rules);
+                if (CollUtil.isNotEmpty(newDataList)) {
+                    appendDataListToFile(datasetVersion.getFileUrl(), newDataList);
+                }
             }
+            }
         } catch (Exception ex) {
             log.error("deal with task handler is error:", ex);
         }
     }
+    /**
+     * Build the rule list
+     * @param cleanId cleaning id
+     * @return rule list
+     */
+    private List<DatasetRule> buildRulesList(Long cleanId) {
+        DatasetCleanConfig cleanConfig = new DatasetCleanConfig();
+        cleanConfig.setCleanId(cleanId);
+        DatasetCleanConfig datasetCleanConfig = datasetCleanConfigService.getOne(cleanConfig);
+        List<DatasetRule> rules = new ArrayList<>();
+        if (null != datasetCleanConfig) {
+            String[] jsonStrings = {datasetCleanConfig.getFilterConfig(), datasetCleanConfig.getDesensitiveConfig(),
+                    datasetCleanConfig.getDeduplicateConfig(), datasetCleanConfig.getCleanConfig()};
+            ObjectMapper objectMapper = new ObjectMapper();
+            rules = Arrays.stream(jsonStrings)
+                    // Skip config columns that are null or blank
+                    .filter(StringUtils::isNotBlank)
+                    .map(jsonString -> {
+                        try {
+                            return objectMapper.readValue(jsonString, DatasetRule[].class);
+                        } catch (JsonProcessingException e) {
+                            log.error("json processing exception is error:", e);
+                            // Return an empty array so flatMap below never sees null
+                            return new DatasetRule[0];
+                        }
+                    })
+                    .flatMap(Arrays::stream)
+                    .collect(Collectors.toList());
+        }
+        return rules;
+    }
     /**
      * First method: clear the file
      * @param filePath file path
@@ -299,7 +329,9 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
      * @param datasetId dataset id
      * @param rules rule list
      */
-    private void dealWithDatasetNodeData(List<DatasetData> dataList, Long datasetId, List<String> rules) {
+    private List<DatasetData> dealWithDatasetNodeData(List<DatasetData> dataList,
+                                                      Long datasetId, List<DatasetRule> rules) {
+        List<DatasetData> newDataList = new ArrayList<>();
         try {
             if (CollUtil.isNotEmpty(dataList)) {
                 ObjectMapper objectMapper = new ObjectMapper();
@@ -307,13 +339,19 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
                 JsonNode rootNode = objectMapper.readTree(datasetData.getData());
                 String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
                 String output = DataCleanerUtil.buildCleanAfterData(data, rules);
+                // Keep the record only if the cleaned data still passes validation; drop it otherwise
+                if (StringUtils.isNotBlank(output)) {
                     datasetData.setData(createNewDataNode(datasetData.getData(), output));
+                    newDataList.add(datasetData);
+                }
             }
-            this.datasetDataService.updateBatch(dataList, datasetId);
+            this.datasetDataService.updateBatch(newDataList, datasetId);
         }
     } catch (JsonProcessingException ex) {
         log.error("deal with dataset node data:", ex);
     }
+    return newDataList;
 }
 /**
@@ -359,15 +397,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         filter.setCleanId(cleanId);
         DatasetCleanConfig cleanConfig = datasetCleanConfigService.getOne(filter);
         if (null != cleanConfig && null == cleanConfig.getCleanConfig()) return cleans;
-        List<String> rules = JsonNameExtractor.extractNames(cleanConfig.getCleanConfig());
         if (CollUtil.isNotEmpty(dataList)) {
             ObjectMapper objectMapper = new ObjectMapper();
+            DatasetRule[] datasetRules = objectMapper.readValue(cleanConfig.getCleanConfig(), DatasetRule[].class);
+            List<DatasetRule> rules = Arrays.asList(datasetRules);
             for (DatasetData datasetData : dataList) {
                 DatasetDataClean dataClean = new DatasetDataClean();
                 JsonNode rootNode = objectMapper.readTree(datasetData.getData());
                 String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
                 dataClean.setCleanBeforeData(data);
+                String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data, rules);
+                // Skip records that clean down to empty instead of aborting the whole batch
+                if (StringUtils.isEmpty(cleanAfterData)) continue;
+                dataClean.setCleanAfterData(cleanAfterData);
                 dataClean.setCleanId(cleanId);
                 dataClean.setCreateTime(new Date());
                 cleans.add(dataClean);
     try {
         DatasetCleanConfig filter = new DatasetCleanConfig();
         filter.setCleanId(cleanId);
-        DatasetCleanConfig cleanConfig = datasetCleanConfigService.getOne(filter);
-        if (null != cleanConfig && null == cleanConfig.getDesensitiveConfig()) return desensitives;
-        List<String> rules = JsonNameExtractor.extractNames(cleanConfig.getDesensitiveConfig());
-        ObjectMapper objectMapper = new ObjectMapper();
+        DatasetCleanConfig desensitiveCfg = datasetCleanConfigService.getOne(filter);
+        if (null != desensitiveCfg && null == desensitiveCfg.getDesensitiveConfig()) return desensitives;
         if (CollUtil.isNotEmpty(dataList)) {
+            ObjectMapper objectMapper = new ObjectMapper();
+            DatasetRule[] datasets = objectMapper.readValue(desensitiveCfg.getDesensitiveConfig(), DatasetRule[].class);
+            List<DatasetRule> rules = Arrays.asList(datasets);
             for (DatasetData datasetData : dataList) {
                 DatasetDataDesensitive desensitive = new DatasetDataDesensitive();
                 JsonNode node = objectMapper.readTree(datasetData.getData());
                 String data = node.get(DatasetConstant.OUTPUT).textValue();
+                String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data, rules);
+                // Skip records that clean down to empty
+                if (StringUtils.isEmpty(cleanAfterData)) continue;
                 desensitive.setCleanBeforeData(data);
-                desensitive.setCleanAfterData(DataCleanerUtil.buildCleanAfterData(data, rules));
+                desensitive.setCleanAfterData(cleanAfterData);
                 desensitive.setCleanId(cleanId);
                 desensitive.setCreateTime(new Date());
                 desensitives.add(desensitive);
@@ -427,15 +479,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         filter.setCleanId(cleanId);
         DatasetCleanConfig deduplicateConfig = datasetCleanConfigService.getOne(filter);
         if (null != deduplicateConfig && null == deduplicateConfig.getDeduplicateConfig()) return deduplicates;
-        List<String> rules = JsonNameExtractor.extractNames(deduplicateConfig.getDeduplicateConfig());
-        ObjectMapper objectMapper = new ObjectMapper();
         if (CollUtil.isNotEmpty(dataList)) {
+            ObjectMapper objectMapper = new ObjectMapper();
+            DatasetRule[] datasetRules = objectMapper.readValue(deduplicateConfig.getDeduplicateConfig(), DatasetRule[].class);
+            List<DatasetRule> rules = Arrays.asList(datasetRules);
             for (DatasetData datasetData : dataList) {
                 DatasetDataDeduplicate deduplicate = new DatasetDataDeduplicate();
                 JsonNode jsonNode = objectMapper.readTree(datasetData.getData());
                 String data = jsonNode.get(DatasetConstant.OUTPUT).textValue();
+                String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data, rules);
+                // Skip records that clean down to empty
+                if (StringUtils.isEmpty(cleanAfterData)) continue;
                 deduplicate.setCleanBeforeData(data);
-                deduplicate.setCleanAfterData(DataCleanerUtil.buildCleanAfterData(data, rules));
+                deduplicate.setCleanAfterData(cleanAfterData);
                 deduplicate.setCleanId(cleanId);
                 deduplicate.setCreateTime(new Date());
                 deduplicates.add(deduplicate);
@@ -461,17 +520,24 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         filter.setCleanId(cleanId);
         DatasetCleanConfig filterConfig = datasetCleanConfigService.getOne(filter);
         if (null != filterConfig && null == filterConfig.getFilterConfig()) return filters;
-        List<String> rules = JsonNameExtractor.extractNames(filterConfig.getFilterConfig());
-        ObjectMapper objectMapper = new ObjectMapper();
         if (CollUtil.isNotEmpty(dataList)) {
+            ObjectMapper objectMapper = new ObjectMapper();
+            DatasetRule[] datasetRules = objectMapper.readValue(filterConfig.getFilterConfig(), DatasetRule[].class);
+            List<DatasetRule> rules = Arrays.asList(datasetRules);
             for (DatasetData datasetData : dataList) {
                 DatasetDataFilter dataFilter = new DatasetDataFilter();
                 dataFilter.setCleanId(cleanId);
                 dataFilter.setCreateTime(new Date());
                 JsonNode rootNode = objectMapper.readTree(datasetData.getData());
                 String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
-                dataFilter.setContent(data);
+                String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data, rules);
+                // Skip records that clean down to empty
+                if (StringUtils.isEmpty(cleanAfterData)) continue;
+                dataFilter.setCleanBeforeData(data);
+                dataFilter.setCleanAfterData(cleanAfterData);
                 filters.add(dataFilter);
             }
         }
...
@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
 public void updateBatch(List<DatasetData> dataList, Long versionId) {
     if (CollUtil.isNotEmpty(dataList)) {
         for (DatasetData datasetData : dataList) {
             // Parse the data field string into a Document/Bson
             Document dataDocument = Document.parse(datasetData.getData());
...
 package com.yice.webadmin.app.util;
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
+import com.hankcs.hanlp.HanLP;
+import com.hankcs.hanlp.seg.common.Term;
 import com.yice.webadmin.app.constant.DatasetCleanConstant;
+import com.yice.webadmin.app.constant.DatasetConstant;
+import com.yice.webadmin.app.data.DatasetRule;
 import lombok.extern.slf4j.Slf4j;
-import java.util.List;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/**
+ * Data cleaning utility class
+ */
 @Slf4j
 public class DataCleanerUtil {
+    private static final Set<String> badWords = new HashSet<>();
+    static {
+        badWords.add("色情");
+        badWords.add("淫秽");
+        badWords.add("迷信");
+        badWords.add("黄色");
+        badWords.add("性行为");
+        badWords.add("暴力");
+    }
     /**
      * Build the cleaned data
      *
@@ -16,17 +36,34 @@ public class DataCleanerUtil {
      * @param rules cleaning rules
      * @return the cleaned data
      */
-    public static String buildCleanAfterData(String data, List<String> rules) {
-        for (String rule : rules) {
+    public static String buildCleanAfterData(String data, List<DatasetRule> rules) {
+        for (DatasetRule rule : rules) {
+            // A rule is switched on when its args threshold is positive;
+            // apply the enabled rules in sequence and return the final text
+            if (rule.getArgs() > 0) {
+                data = buildJsonData(rule.getName(), data, rule.getArgs());
+            }
+        }
+        return data;
+    }
+    /**
+     * Apply a single filter rule
+     * @param rule rule name
+     * @param data data to clean
+     * @param threshold rule threshold
+     * @return the cleaned data
+     */
+    private static String buildJsonData(String rule, String data, double threshold) {
         switch (rule) {
             case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
-                data = data.replaceAll("[\\p{C}]", "&nbsp;");
+                data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", "");
                 break;
             case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
-                data = data.replaceAll("[\\p{Cs}\\p{Co}\\p{Cn}]", "");
+                data = data.replaceAll("[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+", "");
                 break;
             case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
-                data = data.replaceAll("[\\p{Z}\\u2000-\\u200A\\u2028\\u2029\\u3000]", "");
+                data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", "");
                 break;
             case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
                 data = ZhConverterUtil.toSimple(data);
@@ -35,7 +72,7 @@ public class DataCleanerUtil {
                 data = data.replaceAll("<[^>]*>", "");
                 break;
             case DatasetCleanConstant.REMOVE_EMOJI:
-                data = data.replaceAll("[\\ud83c[\\udffb-\\udfff]|\\ud83d[\\udc00-\\ude4f]|\\ud83d[\\ude80-\\udeff]|\\ud83e[\\udd10-\\uddff]]", "");
+                // Private-use area plus emoji surrogate pairs (low surrogates are \udc00-\udfff)
+                data = data.replaceAll("[\\uE000-\\uF8FF]|[\\ud83c\\ud83d\\ud83e][\\udc00-\\udfff]", "");
                 break;
             case DatasetCleanConstant.REPLACE_EMAILS:
                 data = data.replaceAll("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", "EMAIL");
@@ -44,10 +81,213 @@ public class DataCleanerUtil {
                 data = data.replaceAll("\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b", "IP_ADDRESS");
                 break;
             case DatasetCleanConstant.REPLACE_IDENTIFIER:
-                data = data.replaceAll("\\d+", "PI:KEY");
+                data = data.replaceAll("\\d+", "");
                 break;
+            case DatasetCleanConstant.FILTER_CHECK_NUMBER_WORDS:
+                data = filterNumberWords(data, 0, (int) threshold);
+                break;
+            case DatasetCleanConstant.FILTER_CHECK_WORD_REPETITION_REMOVAL:
+                data = filterWordRepetition(data, threshold);
+                break;
+            case DatasetCleanConstant.FILTER_CHECK_CHARACTER_REPETITION_REMOVAL:
+                data = filterCharacterRepetition(data, threshold);
+                break;
+            case DatasetCleanConstant.FILTER_CHECK_SPECIAL_CHARACTERS:
+                data = filterSpecialCharacters(data, threshold);
+                break;
+            case DatasetCleanConstant.FILTER_CHECK_FLAGGED_WORDS:
+                data = filterCheckFlaggedWords(data, threshold);
+                break;
+        }
+        return data;
+    }
+    public static void main(String[] args) {
+        String data = "我们是中国人,我们是地球人,我们要团结起来。团结就是力量,力量就是一切。";
+        data = filterWordRepetition(data, 0.3);
+        System.out.println(data);
+    }
+    /**
+     * Compute the flagged-word ratio
+     * @param data text to evaluate
+     * @return the ratio of flagged words
+     */
+    private static double calculateBadWordRatio(String data) {
+        // Standard segmentation
+        List<Term> termList = HanLP.segment(data);
+        if (termList.isEmpty()) {
+            return 0;
+        }
+        int badWordsCount = 0;
+        for (Term term : termList) {
+            if (badWords.contains(term.word.toLowerCase())) {
+                badWordsCount++;
+            }
+        }
+        return badWordsCount / (double) termList.size();
+    }
+    /**
+     * Check the flagged (pornographic/violent) word rate of a document
+     * @param data data to filter
+     * @param ratio threshold
+     * @return the cleaned data
+     */
+    private static String filterCheckFlaggedWords(String data, double ratio) {
+        StringBuffer result = new StringBuffer();
+        double badWordRatio = calculateBadWordRatio(data);
+        if (badWordRatio > ratio) {
+            Pattern pattern = Pattern.compile("(" + String.join("|", badWords) + ")(?![\\w])");
+            Matcher matcher = pattern.matcher(data);
+            while (matcher.find()) {
+                // Replace every flagged word with the empty string
+                matcher.appendReplacement(result, "");
+            }
+            matcher.appendTail(result);
+        } else {
+            result.append(data);
+        }
+        return result.toString();
+    }
+    /**
+     * Check the word repetition rate of a document:
+     * compute the rate and blank out the content when it exceeds the threshold
+     * @param document document content
+     * @param threshold threshold ratio
+     * @return the document unchanged, or the empty string if the word repetition rate exceeds the threshold
+     */
+    public static String filterCharacterRepetition(String document, double threshold) {
+        // Segment with HanLP
+        List<Term> termList = HanLP.segment(document);
+        if (termList.isEmpty()) {
+            return document;
+        }
+        // Count occurrences of each word
+        Map<String, Integer> wordCountMap = new HashMap<>();
+        for (Term term : termList) {
+            String word = term.word;
+            wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1);
+        }
+        // Total number of words
+        int totalWords = termList.size();
+        // Number of repeated occurrences (occurrences beyond the first of each word)
+        int repeatedWordsCount = 0;
+        for (int count : wordCountMap.values()) {
+            if (count > 1) {
+                repeatedWordsCount += count - 1;
+            }
+        }
+        // Word repetition rate
+        double repetitionRate = (double) repeatedWordsCount / totalWords;
+        return repetitionRate < threshold ? document : DatasetConstant.EMPTY_STR;
+    }
+    /**
+     * Check the character repetition rate of a document
+     * @param text data to clean
+     * @param threshold threshold ratio
+     * @return the text unchanged, or the empty string if the character repetition rate exceeds the threshold
+     */
+    public static String filterWordRepetition(String text, double threshold) {
+        // Convert the text to a character array
+        char[] characters = text.toCharArray();
+        if (characters.length == 0) {
+            return text;
+        }
+        // Count occurrences of each character
+        Map<Character, Integer> characterCountMap = new HashMap<>();
+        for (char c : characters) {
+            characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1);
+        }
+        // Total number of characters
+        int totalCharacters = characters.length;
+        // Number of repeated occurrences (occurrences beyond the first of each character)
+        int repeatedCharactersCount = 0;
+        for (int count : characterCountMap.values()) {
+            if (count > 1) {
+                repeatedCharactersCount += count - 1;
+            }
+        }
+        // Character repetition rate
+        double repetitionRate = (double) repeatedCharactersCount / totalCharacters;
+        return repetitionRate < threshold ? text : DatasetConstant.EMPTY_STR;
+    }
+    /**
+     * Check the special character rate of a document
+     * @param data data to filter
+     * @param ratio threshold
+     * @return the cleaned data
+     */
+    private static String filterSpecialCharacters(String data, double ratio) {
+        StringBuffer result = new StringBuffer();
+        double specialCharacterRatio = calculateSpecialCharacterRatio(data);
+        if (specialCharacterRatio > ratio) {
+            result.append(data.replaceAll("[#$%^&*()]", ""));
+        } else {
+            result.append(data);
+        }
+        return result.toString();
+    }
+    /**
+     * Compute the special character ratio
+     * @param data data
+     * @return the ratio
+     */
+    private static double calculateSpecialCharacterRatio(String data) {
+        List<Term> termList = HanLP.segment(data);
+        if (termList.isEmpty()) {
+            return 0;
+        }
+        int specialCharactersCount = 0;
+        for (Term term : termList) {
+            if (term.word.matches(".*[#$%^&*()].*")) {
+                specialCharactersCount++;
+            }
+        }
+        return specialCharactersCount / (double) termList.size();
+    }
+    /**
+     * Truncate a document to a maximum length
+     * @param data data to filter
+     * @param maxLength maximum number of characters to keep (from the end)
+     * @return the truncated string
+     */
+    public static String filterNumberWords(String data, int maxLength) {
+        data = data.toLowerCase();
+        if (data.length() <= maxLength) {
+            return data;
+        }
+        int startIndex = data.length() - maxLength;
+        return data.substring(startIndex);
+    }
+    /**
+     * Check the word count of a document
+     * @param content data to filter
+     * @param minWordCount minimum word count
+     * @param maxWordCount maximum word count
+     * @return the original text, or the empty string if the word count is out of range
+     */
+    private static String filterNumberWords(String content, int minWordCount, int maxWordCount) {
+        // Standard segmentation
+        List<Term> termList = HanLP.segment(content);
+        // Number of words
+        int wordCount = termList.size();
+        // Check whether the word count falls within the given range
+        if (wordCount < minWordCount || wordCount > maxWordCount) {
+            // Out of range: return the empty string
+            return DatasetConstant.EMPTY_STR;
+        }
+        // In range: return the original text
+        return content;
+    }
 }
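To make the repetition math concrete, a small worked check (a sketch; counts for the word-level variant would additionally depend on how HanLP segments the input):

    // Character level, no segmentation involved:
    // "abcabc" -> a:2, b:2, c:2 -> repeated = 3, total = 6, rate = 0.5
    String kept = DataCleanerUtil.filterWordRepetition("abcabc", 0.6);    // 0.5 < 0.6 -> text kept
    String dropped = DataCleanerUtil.filterWordRepetition("abcabc", 0.4); // 0.5 >= 0.4 -> ""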
@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yice.webadmin.app.constant.DatasetConstant;
+import com.yice.webadmin.app.data.DatasetRule;
 import lombok.extern.slf4j.Slf4j;
 import java.io.IOException;
@@ -62,4 +64,24 @@ public class JsonNameExtractor {
         return names;
     }
+    /**
+     * Parse a single rule JSON object into a DatasetRule
+     * @param rule rule JSON string
+     * @return the parsed rule
+     */
+    public static DatasetRule buildRuleData(String rule) {
+        DatasetRule datasetRule = new DatasetRule();
+        try {
+            ObjectMapper objectMapper = new ObjectMapper();
+            JsonNode config = objectMapper.readTree(rule);
+            // asDouble()/asText() cope with both numeric and textual JSON values,
+            // where textValue() would return null on a numeric node
+            datasetRule.setArgs(config.get(DatasetConstant.ARGS).asDouble());
+            datasetRule.setName(config.get(DatasetConstant.NAME).asText());
+        } catch (IOException e) {
+            log.error("failed to parse rule json:", e);
+        }
+        return datasetRule;
+    }
 }
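A quick usage sketch of the new helper (the JSON payload is illustrative):

    DatasetRule rule = JsonNameExtractor.buildRuleData(
            "{\"name\":\"filter_check_special_characters\",\"args\":0.25}");
    // rule.toString() -> DatasetRule{args=0.25, name='filter_check_special_characters'}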
@@ -64,7 +64,7 @@ public class DatasetCleanVo {
     /**
      * Cleaning status.
      */
-    @ApiModelProperty(value = "Cleaning status")
+    @ApiModelProperty(value = "Cleaning status; 0: not cleaned; 1: cleaned; 2: paused")
     private Integer cleanStatus;
     /**
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
     private Long cleanId;
     /**
-     * Filtered content
+     * Data before cleaning
      */
-    @ApiModelProperty(value = "Filtered content")
-    private String content;
+    @ApiModelProperty(value = "Data before cleaning")
+    private String cleanBeforeData;
+    /**
+     * Data after cleaning.
+     */
+    @ApiModelProperty(value = "Data after cleaning")
+    private String cleanAfterData;
     /**
      * Creation time.
...