Commit 7f447ff0 authored by pengxin

Added a switch module for dataset cleaning rules.
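The switch works through the numeric args value carried by each rule: a rule fires only when args > 0. A minimal sketch of the intended flow, assuming a config payload shaped like the arrays buildRulesList parses (the JSON literal is illustrative, not a real config; Jackson is already a project dependency):

ObjectMapper mapper = new ObjectMapper();
// Hypothetical config snippet; real configs come from the DatasetCleanConfig fields.
DatasetRule[] rules = mapper.readValue(
        "[{\"name\":\"filter_check_number_words\",\"args\":\"500\"},"
                + "{\"name\":\"filter_check_flagged_words\",\"args\":\"0\"}]",
        DatasetRule[].class);
for (DatasetRule rule : rules) {
    if (rule.getArgs() > 0) {
        // Switched on: apply the rule named rule.getName() with threshold rule.getArgs()
    }
    // args == 0: the rule stays off and the data passes through unchanged
}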

parent cc809ff6
......@@ -70,6 +70,11 @@
<artifactId>opencc4j</artifactId>
<version>1.6.2</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
</dependencies>
<build>
......
......@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
*/
public static final String REPLACE_IDENTIFIER = "replace_identifier";
/**
* Check the number of words in a document
*/
public static final String FILTER_CHECK_NUMBER_WORDS = "filter_check_number_words";
/**
* Check the character repetition rate of a document
*/
public static final String FILTER_CHECK_WORD_REPETITION_REMOVAL = "filter_check_word_repetition_removal";
/**
* Check the word repetition rate of a document
*/
public static final String FILTER_CHECK_CHARACTER_REPETITION_REMOVAL = "filter_check_character_repetition_removal";
/**
* Check the special character rate of a document
*/
public static final String FILTER_CHECK_SPECIAL_CHARACTERS = "filter_check_special_characters";
/**
* Check the rate of flagged (pornographic/violent) words in a document
*/
public static final String FILTER_CHECK_FLAGGED_WORDS = "filter_check_flagged_words";
}
......@@ -7,6 +7,11 @@ public class DatasetConstant {
*/
public static final Integer STATUS_UNPUBLISHED = 0;
/**
* Imported
*/
public static final Integer INPUT_STATUS = 1;
/**
* Published status
*/
......@@ -62,6 +67,21 @@ public class DatasetConstant {
*/
public static final String OUTPUT = "output";
/**
* The args parameter key
*/
public static final String ARGS = "args";
/**
* The name parameter key
*/
public static final String NAME = "name";
/**
* Switch-off state
*/
public static final String CLOSED = "0";
/**
* The data field
*/
......@@ -82,6 +102,16 @@ public class DatasetConstant {
*/
public static final Integer CLEAN_FINISHED = 1;
/**
* Cleaning paused
*/
public static final Integer PAUSE_FINISHED = 2;
/**
* Empty string
*/
public static final String EMPTY_STR = "";
/**
* Text data cleaning
*/
......
......@@ -362,7 +362,7 @@ public class DatasetVersionController {
//Then store the dataset configuration file
datasetVersionService.saveDatasetInfo(versionName);
datasetVersion.setFileUrl(fullName);
datasetVersion.setInputStatus(1);
datasetVersion.setInputStatus(DatasetConstant.INPUT_STATUS);
datasetVersion.setDataVolume(Long.valueOf(JSON.parseArray(new String(importFile.getBytes(), StandardCharsets.UTF_8)).size()));
this.datasetVersionService.updateById(datasetVersion);
return ResponseResult.success();
......
......@@ -27,11 +27,17 @@ public class DatasetDataFilter {
@Field("clean_id")
private Long cleanId;
/**
* Data before cleaning
*/
@Field("clean_before_data")
private String cleanBeforeData;
/**
* Data after cleaning
*/
@Field("content")
private String content;
@Field("clean_after_data")
private String cleanAfterData;
/**
* Creation time
......
package com.yice.webadmin.app.data;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DatasetRule {
/**
* Rule name
*/
private String name;
/**
* Rule argument (threshold); a value greater than 0 switches the rule on
*/
private double args;
@Override
public String toString() {
return "DatasetRule{" +
"args=" + args +
", name='" + name + '\'' +
'}';
}
}
......@@ -64,7 +64,7 @@ public class DatasetCleanDto {
/**
* Cleaning status.
*/
@ApiModelProperty(value = "Cleaning status: 0: in progress; 1: finished")
@ApiModelProperty(value = "Cleaning status; 0: not cleaned; 1: cleaned; 2: paused")
private Integer cleanStatus;
/**
......
......@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
private Long cleanId;
/**
* Filtered content
* Data before cleaning
*/
@ApiModelProperty(value = "Filtered content")
private String content;
@ApiModelProperty(value = "Data before cleaning")
private String cleanBeforeData;
/**
* Data after cleaning.
*/
@ApiModelProperty(value="Data after cleaning")
private String cleanAfterData;
/**
* Creation time.
......
......@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List<Document> documents = new ArrayList<>();
if(CollUtil.isNotEmpty(filters)) {
for(DatasetDataFilter filter : filters) {
Document document = new Document(MongoConstant.CONTENT, filter.getContent())
Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, filter.getCleanBeforeData())
.append(MongoConstant.CLEAN_AFTER_DATA, filter.getCleanAfterData())
.append(MongoConstant.CLEAN_ID, filter.getCleanId())
.append(MongoConstant.CREATE_TIME, new Date());
documents.add(document);
......
......@@ -27,6 +27,7 @@ import com.yice.webadmin.app.service.DatasetVersionService;
import com.yice.webadmin.app.util.DataCleanerUtil;
import com.yice.webadmin.app.util.JsonNameExtractor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
......@@ -37,6 +38,7 @@ import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
......@@ -176,6 +178,13 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
Future<?> future = futures.remove(cleanId);
if (future != null && !future.isDone()) {
future.cancel(true);
//Mark the cleaning task as paused
DatasetClean filter = new DatasetClean();
filter.setCleanStatus(DatasetConstant.PAUSE_FINISHED);
filter.setFinishTime(null);
filter.setCleanId(cleanId);
this.updateById(filter);
}
}
......@@ -204,42 +213,63 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
*/
private void dealWithTaskHandler(Long datasetId, Long cleanId) {
try {
List<String> rules = new ArrayList<>();
DatasetCleanConfig filter = new DatasetCleanConfig();
filter.setCleanId(cleanId);
DatasetCleanConfig config = datasetCleanConfigService.getOne(filter);
if(null != config) {
rules.add(config.getFilterConfig());
rules.add(config.getDesensitiveConfig());
rules.add(config.getDeduplicateConfig());
rules.add(config.getCleanConfig());
rules = rules.stream()
.filter(rule -> rule != null && !rule.isEmpty())
.collect(Collectors.toList());
rules = JsonNameExtractor.extractNames(rules);
}
DatasetVersion datasetVersion = this.datasetVersionService.getById(datasetId);
datasetVersionService.saveDatasetInfo(datasetVersion.getVersionName());
clearFileDatasetData(datasetVersion.getFileUrl());
Long count = datasetDataService.count(datasetId);
int pageSize = DatasetConstant.MAX_SIZE;
int totalPages = (int) Math.ceil((double) count / pageSize);
MyPageParam param;
for (int i = 1; i <= totalPages; i++) {
param = new MyPageParam();
param.setPageNum(i);
param.setPageSize(pageSize);
List<DatasetData> dataList = datasetDataService.list(datasetId, param);
dealWithDatasetNodeData(dataList, datasetId, rules);
appendDataListToFile(datasetVersion.getFileUrl() ,dataList);
if (count > 0) {
List<DatasetRule> rules = buildRulesList(cleanId);
int pageSize = DatasetConstant.MAX_SIZE;
int totalPages = (int) Math.ceil((double) count / pageSize);
MyPageParam param;
for (int i = 1; i <= totalPages; i++) {
param = new MyPageParam();
param.setPageNum(i);
param.setPageSize(pageSize);
List<DatasetData> dataList = datasetDataService.list(datasetId, param);
List<DatasetData> newDataList = dealWithDatasetNodeData(dataList, datasetId, rules);
if(CollUtil.isNotEmpty(newDataList)) {
appendDataListToFile(datasetVersion.getFileUrl(), newDataList);
}
}
}
} catch (Exception ex) {
log.error("deal with task handler is error:" , ex);
}
}
/**
* Build the rule list
* @param cleanId cleaning task ID
* @return the rule list
*/
private List<DatasetRule> buildRulesList(Long cleanId) {
DatasetCleanConfig cleanConfig = new DatasetCleanConfig();
cleanConfig.setCleanId(cleanId);
DatasetCleanConfig datasetCleanConfig = datasetCleanConfigService.getOne(cleanConfig);
List<DatasetRule> rules = new ArrayList<>();
if(null != datasetCleanConfig) {
String[] jsonStrings = {datasetCleanConfig.getFilterConfig(), datasetCleanConfig.getDesensitiveConfig(),
datasetCleanConfig.getDeduplicateConfig(), datasetCleanConfig.getCleanConfig()};
ObjectMapper objectMapper = new ObjectMapper();
rules = Arrays.stream(jsonStrings)
// Skip configs that are absent and drop entries that fail to parse
.filter(StringUtils::isNotBlank)
.map(jsonString -> {
try {
return objectMapper.readValue(jsonString, DatasetRule[].class);
} catch (JsonProcessingException e) {
log.error("json processing error:", e);
return new DatasetRule[0];
}
})
.flatMap(Arrays::stream)
.collect(Collectors.toList());
}
return rules;
}
/**
* Clear the file contents
* @param filePath file path
......@@ -299,7 +329,9 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
* @param datasetId dataset ID
* @param rules rule list
*/
private void dealWithDatasetNodeData(List<DatasetData> dataList, Long datasetId, List<String> rules) {
private List<DatasetData> dealWithDatasetNodeData(List<DatasetData> dataList,
Long datasetId, List<DatasetRule> rules) {
List<DatasetData> newDataList = new ArrayList<>();
try {
if(CollUtil.isNotEmpty(dataList)) {
ObjectMapper objectMapper = new ObjectMapper();
......@@ -307,13 +339,19 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
JsonNode rootNode = objectMapper.readTree(datasetData.getData());
String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
String output = DataCleanerUtil.buildCleanAfterData(data, rules);
datasetData.setData(createNewDataNode(datasetData.getData(), output));
//Check whether the cleaned data still qualifies: keep it if so, otherwise drop it.
if(StringUtils.isNotBlank(output)) {
datasetData.setData(createNewDataNode(datasetData.getData(), output));
newDataList.add(datasetData);
}
}
this.datasetDataService.updateBatch(dataList, datasetId);
this.datasetDataService.updateBatch(newDataList, datasetId);
}
}catch (JsonProcessingException ex){
log.error("deal with dataset node data:", ex);
}
return newDataList;
}
/**
......@@ -359,15 +397,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter.setCleanId(cleanId);
DatasetCleanConfig cleanConfig = datasetCleanConfigService.getOne(filter);
if(null != cleanConfig && null == cleanConfig.getCleanConfig()) return cleans;
List<String> rules = JsonNameExtractor.extractNames(cleanConfig.getCleanConfig());
if(CollUtil.isNotEmpty(dataList)) {
ObjectMapper objectMapper = new ObjectMapper();
DatasetRule[] datasetRules = objectMapper.readValue(cleanConfig.getCleanConfig(), DatasetRule[].class);
List<DatasetRule> rules = Arrays.asList(datasetRules);
for(DatasetData datasetData: dataList) {
DatasetDataClean dataClean = new DatasetDataClean();
JsonNode rootNode = objectMapper.readTree(datasetData.getData());
String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
dataClean.setCleanBeforeData(data);
dataClean.setCleanAfterData(DataCleanerUtil.buildCleanAfterData(data,rules));
String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data,rules);
//Skip records whose content the rules removed entirely
if(StringUtils.isEmpty(cleanAfterData)) continue;
dataClean.setCleanAfterData(cleanAfterData);
dataClean.setCleanId(cleanId);
dataClean.setCreateTime(new Date());
cleans.add(dataClean);
......@@ -391,17 +436,24 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
try {
DatasetCleanConfig filter = new DatasetCleanConfig();
filter.setCleanId(cleanId);
DatasetCleanConfig cleanConfig = datasetCleanConfigService.getOne(filter);
if(null != cleanConfig && null == cleanConfig.getDesensitiveConfig()) return desensitives;
List<String> rules = JsonNameExtractor.extractNames(cleanConfig.getDesensitiveConfig());
ObjectMapper objectMapper = new ObjectMapper();
DatasetCleanConfig desensitiveCfg = datasetCleanConfigService.getOne(filter);
if(null == desensitiveCfg || null == desensitiveCfg.getDesensitiveConfig()) return desensitives;
if(CollUtil.isNotEmpty(dataList)) {
ObjectMapper objectMapper = new ObjectMapper();
DatasetRule[] datasets = objectMapper.readValue(desensitiveCfg.getDesensitiveConfig(), DatasetRule[].class);
List<DatasetRule> rules = Arrays.asList(datasets);
for(DatasetData datasetData: dataList) {
DatasetDataDesensitive desensitive = new DatasetDataDesensitive();
JsonNode node = objectMapper.readTree(datasetData.getData());
String data = node.get(DatasetConstant.OUTPUT).textValue();
String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data,rules);
if(StringUtils.isEmpty(cleanAfterData)) continue;
desensitive.setCleanBeforeData(data);
desensitive.setCleanAfterData(DataCleanerUtil.buildCleanAfterData(data,rules));
desensitive.setCleanAfterData(cleanAfterData);
desensitive.setCleanId(cleanId);
desensitive.setCreateTime(new Date());
desensitives.add(desensitive);
......@@ -427,15 +479,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter.setCleanId(cleanId);
DatasetCleanConfig deduplicateConfig = datasetCleanConfigService.getOne(filter);
if(null == deduplicateConfig || null == deduplicateConfig.getDeduplicateConfig()) return deduplicates;
List<String> rules = JsonNameExtractor.extractNames(deduplicateConfig.getDeduplicateConfig());
ObjectMapper objectMapper = new ObjectMapper();
if(CollUtil.isNotEmpty(dataList)) {
ObjectMapper objectMapper = new ObjectMapper();
DatasetRule[] datasetRules = objectMapper.readValue(deduplicateConfig.getDeduplicateConfig(), DatasetRule[].class);
List<DatasetRule> rules = Arrays.asList(datasetRules);
for(DatasetData datasetData: dataList) {
DatasetDataDeduplicate deduplicate = new DatasetDataDeduplicate();
JsonNode jsonNode = objectMapper.readTree(datasetData.getData());
String data = jsonNode.get(DatasetConstant.OUTPUT).textValue();
String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data,rules);
if(StringUtils.isEmpty(cleanAfterData)) continue;
deduplicate.setCleanBeforeData(data);
deduplicate.setCleanAfterData(DataCleanerUtil.buildCleanAfterData(data, rules));
deduplicate.setCleanAfterData(cleanAfterData);
deduplicate.setCleanId(cleanId);
deduplicate.setCreateTime(new Date());
deduplicates.add(deduplicate);
......@@ -461,17 +520,24 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter.setCleanId(cleanId);
DatasetCleanConfig filterConfig = datasetCleanConfigService.getOne(filter);
if(null == filterConfig || null == filterConfig.getFilterConfig()) return filters;
List<String> rules = JsonNameExtractor.extractNames(filterConfig.getFilterConfig());
ObjectMapper objectMapper = new ObjectMapper();
if(CollUtil.isNotEmpty(dataList)) {
ObjectMapper objectMapper = new ObjectMapper();
DatasetRule[] datasetRules = objectMapper.readValue(filterConfig.getFilterConfig(), DatasetRule[].class);
List<DatasetRule> rules = Arrays.asList(datasetRules);
for(DatasetData datasetData: dataList) {
DatasetDataFilter dataFilter = new DatasetDataFilter();
dataFilter.setCleanId(cleanId);
dataFilter.setCreateTime(new Date());
//TODO
JsonNode rootNode = objectMapper.readTree(datasetData.getData());
String data = rootNode.get(DatasetConstant.OUTPUT).textValue();
dataFilter.setContent(data);
String cleanAfterData = DataCleanerUtil.buildCleanAfterData(data,rules);
if(StringUtils.isEmpty(cleanAfterData)) continue;
dataFilter.setCleanBeforeData(data);
dataFilter.setCleanAfterData(cleanAfterData);
filters.add(dataFilter);
}
}
......
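All four builders above share one contract: a record is kept only when DataCleanerUtil.buildCleanAfterData returns non-empty text. A minimal JUnit-style check of that contract (a sketch; assumes JUnit 5 on the classpath and that the sample sentence segments into more than three HanLP terms):

@Test
void recordEmptiedByRulesIsDropped() {
    // filter_check_number_words with args = 3 keeps only documents of at most three words
    List<DatasetRule> rules = Arrays.asList(
            new DatasetRule(DatasetCleanConstant.FILTER_CHECK_NUMBER_WORDS, 3));
    String cleaned = DataCleanerUtil.buildCleanAfterData("这是一段明显超过三个词的测试文本", rules);
    // Empty output is the signal the builders use to skip the record instead of storing it
    assertTrue(cleaned.isEmpty());
}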
......@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
public void updateBatch(List<DatasetData> dataList, Long versionId) {
if(CollUtil.isNotEmpty(dataList)) {
for(DatasetData datasetData : dataList) {
// Parse the data field string into a Document/Bson
Document dataDocument = Document.parse(datasetData.getData());
......
package com.yice.webadmin.app.util;
import com.github.houbb.opencc4j.util.ZhConverterUtil;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.yice.webadmin.app.constant.DatasetCleanConstant;
import com.yice.webadmin.app.constant.DatasetConstant;
import com.yice.webadmin.app.data.DatasetRule;
import lombok.extern.slf4j.Slf4j;
import java.util.List;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Data cleaning utility class
*/
@Slf4j
public class DataCleanerUtil {
private static final Set<String> badWords = new HashSet<>();
static {
badWords.add("色情");
badWords.add("淫秽");
badWords.add("迷信");
badWords.add("黄色");
badWords.add("性行为");
badWords.add("暴力");
}
/**
* Build the cleaned data
*
......@@ -16,38 +36,258 @@ public class DataCleanerUtil {
* @param rules cleaning rules
* @return the cleaned data
*/
public static String buildCleanAfterData(String data, List<String> rules) {
for (String rule : rules) {
switch (rule) {
case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
data = data.replaceAll("[\\p{C}]", "&nbsp;");
break;
case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
data = data.replaceAll("[\\p{Cs}\\p{Co}\\p{Cn}]", "");
break;
case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
data = data.replaceAll("[\\p{Z}\\u2000-\\u200A\\u2028\\u2029\\u3000]", "");
break;
case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
data = ZhConverterUtil.toSimple(data);
break;
case DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS:
data = data.replaceAll("<[^>]*>", "");
break;
case DatasetCleanConstant.REMOVE_EMOJI:
data = data.replaceAll("[\\ud83c[\\udffb-\\udfff]|\\ud83d[\\udc00-\\ude4f]|\\ud83d[\\ude80-\\udeff]|\\ud83e[\\udd10-\\uddff]]", "");
break;
case DatasetCleanConstant.REPLACE_EMAILS:
data = data.replaceAll("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", "EMAIL");
break;
case DatasetCleanConstant.REPLACE_IP:
data = data.replaceAll("\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b", "IP_ADDRESS");
break;
case DatasetCleanConstant.REPLACE_IDENTIFIER:
data = data.replaceAll("\\d+", "PI:KEY");
break;
public static String buildCleanAfterData(String data, List<DatasetRule> rules) {
StringBuilder sb = new StringBuilder();
for (DatasetRule rule : rules) {
if(rule.getArgs() > 0) {
data = buildJsonData(rule.getName(), data, rule.getArgs());
sb.append(data);
}
}
return sb.toString();
}
/**
* Apply a single cleaning rule
* @param rule rule name
* @param data data to clean
* @param radio threshold (or size argument, depending on the rule)
* @return the cleaned data
*/
private static String buildJsonData(String rule, String data, double radio) {
switch (rule) {
case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", "");
break;
case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
data = data.replaceAll("[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+", "");
break;
case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", "");
break;
case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
data = ZhConverterUtil.toSimple(data);
break;
case DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS:
data = data.replaceAll("<[^>]*>", "");
break;
case DatasetCleanConstant.REMOVE_EMOJI:
data = data.replaceAll("[\\uE000-\\uF8FF]|\ud83c[\\ud000-\\udfff]|\ud83d[\\ud000-\\udfff]|\ud83e[\\ud000-\\udfff]", "");
break;
case DatasetCleanConstant.REPLACE_EMAILS:
data = data.replaceAll("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", "EMAIL");
break;
case DatasetCleanConstant.REPLACE_IP:
data = data.replaceAll("\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b", "IP_ADDRESS");
break;
case DatasetCleanConstant.REPLACE_IDENTIFIER:
data = data.replaceAll("\\d+", "");
break;
case DatasetCleanConstant.FILTER_CHECK_NUMBER_WORDS:
data = filterNumberWords(data, 0, (int)radio);
break;
case DatasetCleanConstant.FILTER_CHECK_WORD_REPETITION_REMOVAL:
data = filterWordRepetition(data, radio);
break;
case DatasetCleanConstant.FILTER_CHECK_CHARACTER_REPETITION_REMOVAL:
data = filterCharacterRepetition(data, radio);
break;
case DatasetCleanConstant.FILTER_CHECK_SPECIAL_CHARACTERS:
data = filterSpecialCharacters(data, radio);
break;
case DatasetCleanConstant.FILTER_CHECK_FLAGGED_WORDS:
data = filterCheckFlaggedWords(data, radio);
break;
}
return data;
}
public static void main(String [] args) {
// Ad-hoc smoke test for the character repetition filter
String data = "我们是中国人,我们是地球人,我们要团结起来。团结就是力量,力量就是一切。";
data = filterWordRepetition(data,0.3);
System.out.println(data);
}
/**
* Calculate the flagged-word ratio
* @param data data to inspect
* @return the ratio of flagged words to total words
*/
private static double calculateBadWordRatio(String data) {
// Standard HanLP segmentation
List<Term> termList = HanLP.segment(data);
// Guard: empty input would otherwise make the ratio below NaN
if (termList.isEmpty()) return 0.0;
int badWordsCount = 0;
for (Term term : termList) {
if (badWords.contains(term.word.toLowerCase())) {
badWordsCount++;
}
}
return badWordsCount / (double) termList.size();
}
/**
* Check the document's rate of flagged (pornographic/violent) words
* @param data data to filter
* @param radio threshold
* @return the cleaned data
*/
private static String filterCheckFlaggedWords(String data,double radio) {
StringBuffer result = new StringBuffer();
double badWordRatio = calculateBadWordRatio(data);
if (badWordRatio > radio) {
Pattern pattern = Pattern.compile("(" + String.join("|", badWords) + ")(?![\\w])");
Matcher matcher = pattern.matcher(data);
while (matcher.find()) {
//Any flagged word found is replaced with an empty string
matcher.appendReplacement(result, "");
}
matcher.appendTail(result);
} else {
result.append(data);
}
return result.toString();
}
/**
* Check the word repetition rate of a document
* Computes the word repetition rate
* @param document document content
* @param threshold threshold ratio
* @return the original document, or an empty string if the word repetition rate reaches the threshold
*/
public static String filterCharacterRepetition(String document, double threshold) {
// Segment with HanLP
List<Term> termList = HanLP.segment(document);
// Count occurrences of each word
Map<String, Integer> wordCountMap = new HashMap<>();
for (Term term : termList) {
String word = term.word;
wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1);
}
// Total word count
int totalWords = termList.size();
// Count repeated occurrences (occurrences beyond the first of each word)
int repeatedWordsCount = 0;
for (int count : wordCountMap.values()) {
if (count > 1) {
// Only count the extra occurrences
repeatedWordsCount += count - 1;
}
}
// Word repetition rate
double repetitionRate = (double) repeatedWordsCount / totalWords;
return repetitionRate < threshold ? document : DatasetConstant.EMPTY_STR;
}
/**
* Check the character repetition rate of a document
* @param text data to clean
* @param threshold threshold
* @return the original text, or an empty string if the character repetition rate reaches the threshold
*/
public static String filterWordRepetition(String text, double threshold) {
// Convert the text to a character array
char[] characters = text.toCharArray();
// Count occurrences of each character
Map<Character, Integer> characterCountMap = new HashMap<>();
for (char c : characters) {
characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1);
}
// Total character count
int totalCharacters = characters.length;
// Count repeated occurrences (occurrences beyond the first of each character)
int repeatedCharactersCount = 0;
for (int count : characterCountMap.values()) {
if (count > 1) {
// Only count the extra occurrences
repeatedCharactersCount += count - 1;
}
}
// Character repetition rate
double repetitionRate = (double) repeatedCharactersCount / totalCharacters;
return repetitionRate < threshold ? text : DatasetConstant.EMPTY_STR;
}
/**
* Filter on the document's special character rate
* @param data data to filter
* @param radio threshold
* @return the data, with special characters removed when the rate exceeds the threshold
*/
private static String filterSpecialCharacters(String data, double radio) {
StringBuffer result = new StringBuffer();
double specialCharacterRatio = calculateSpecialCharacterRatio(data);
if (specialCharacterRatio > radio) {
result.append(data.replaceAll("[#$%^&*()]", ""));
} else {
result.append(data);
}
return result.toString();
}
/**
* Calculate the special character ratio
* @param data data
* @return the ratio of special-character terms to total terms
*/
private static double calculateSpecialCharacterRatio(String data) {
List<Term> termList = HanLP.segment(data);
int specialCharactersCount = 0;
for (Term term : termList) {
if (term.word.matches(".*[#$%^&*()].*")) {
specialCharactersCount++;
}
}
return specialCharactersCount / (double) termList.size();
}
/**
* Truncate the document, keeping at most the last radio characters (lower-cased)
* @param data data to filter
* @param radio maximum number of characters to keep
* @return the truncated string
*/
public static String filterNumberWords(String data, int radio) {
data = data.toLowerCase();
if (data.length() <= radio) {
return data;
}
int startIndex = data.length() - radio;
return data.substring(startIndex);
}
/**
* Check the document's word count
* @param content data to filter
* @param minWordCount minimum word count
* @param maxWordCount maximum word count
* @return the original content, or an empty string if the word count is out of range
*/
private static String filterNumberWords(String content, int minWordCount, int maxWordCount) {
// Standard segmentation
List<Term> termList = HanLP.segment(content);
// Count the words
int wordCount = termList.size();
// Check whether the word count lies in the allowed range
if (wordCount < minWordCount || wordCount > maxWordCount) {
// Out of range: return an empty string
return DatasetConstant.EMPTY_STR;
}
// In range: return the original text
return content;
}
}
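A quick usage sketch of the utility class (rule values are illustrative; DatasetRule(name, args) follows its @AllArgsConstructor field order):

List<DatasetRule> rules = Arrays.asList(
        new DatasetRule(DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS, 1),
        new DatasetRule(DatasetCleanConstant.REPLACE_EMAILS, 1),
        new DatasetRule(DatasetCleanConstant.FILTER_CHECK_SPECIAL_CHARACTERS, 0.2));
String cleaned = DataCleanerUtil.buildCleanAfterData("<p>联系 someone@example.com ###</p>", rules);
// Tags are stripped and the address becomes "EMAIL"; the "#" run is removed only
// if special-character terms exceed 20% of the segmented tokens.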
......@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yice.webadmin.app.constant.DatasetConstant;
import com.yice.webadmin.app.data.DatasetRule;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
......@@ -62,4 +64,24 @@ public class JsonNameExtractor {
return names;
}
/**
* Build a rule object from a JSON rule string
* @param rule JSON rule string
* @return the parsed rule
*/
public static DatasetRule buildRuleData(String rule) {
DatasetRule datasetRule = new DatasetRule();
try {
ObjectMapper objectMapper = new ObjectMapper();
JsonNode config = objectMapper.readTree(rule);
String args = config.get(DatasetConstant.ARGS).asText();
datasetRule.setArgs(Double.valueOf(args));
String name = config.get(DatasetConstant.NAME).textValue();
datasetRule.setName(name);
} catch (IOException e) {
log.error("extract name method overload is error", e);
}
return datasetRule;
}
}
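Since buildRuleData reads args via asText(), a switch entry can carry its threshold as either a JSON string or a number. An illustrative call (the literal is hypothetical):

DatasetRule rule = JsonNameExtractor.buildRuleData(
        "{\"name\":\"filter_check_flagged_words\",\"args\":\"0.1\"}");
// rule.getName() -> "filter_check_flagged_words"; rule.getArgs() -> 0.1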
......@@ -64,7 +64,7 @@ public class DatasetCleanVo {
/**
* Cleaning status.
*/
@ApiModelProperty(value = "Cleaning status")
@ApiModelProperty(value = "Cleaning status; 0: not cleaned; 1: cleaned; 2: paused")
private Integer cleanStatus;
/**
......
......@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
private Long cleanId;
/**
* Filtered content
* Data before cleaning
*/
@ApiModelProperty(value = "Filtered content")
private String content;
@ApiModelProperty(value = "Data before cleaning")
private String cleanBeforeData;
/**
* Data after cleaning.
*/
@ApiModelProperty(value="Data after cleaning")
private String cleanAfterData;
/**
* Creation time.
......