Commit 32843a49 authored by pengxin's avatar pengxin

清洗数据新增规则。

parent 56c625a3
......@@ -65,6 +65,11 @@
<artifactId>core</artifactId>
<version>3.4.1</version>
</dependency>
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>opencc4j</artifactId>
<version>1.6.2</version>
</dependency>
</dependencies>
<build>
......
package com.yice.webadmin.app.constant;
/**
 * String keys that identify the individual data-cleaning rules.
 *
 * <p>This class only holds constants; it is not meant to be instantiated or extended.
 */
public final class DatasetCleanConstant {

    /**
     * Rule key: remove invisible characters.
     */
    public static final String REMOVE_INVISIBLE_CHARACTER = "remove_invisible_character";

    /**
     * Rule key: normalize whitespace.
     */
    public static final String REPLACE_UNIFORM_WHITESPACE = "replace_uniform_whitespace";

    /**
     * Rule key: remove garbled (non-meaningful) characters.
     */
    public static final String REMOVE_NON_MEANING_CHARACTERS = "remove_non_meaning_characters";

    /**
     * Rule key: convert Traditional Chinese to Simplified Chinese.
     */
    public static final String REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED = "replace_traditional_chinese_to_simplified";

    /**
     * Rule key: remove web (markup) identifiers.
     */
    public static final String REMOVE_WEB_IDENTIFIERS = "remove_web_identifiers";

    /**
     * Rule key: remove emoji.
     */
    public static final String REMOVE_EMOJI = "remove_emoji";

    /**
     * Rule key: replace e-mail addresses.
     */
    public static final String REPLACE_EMAILS = "replace_emails";

    /**
     * Rule key: replace IP addresses.
     */
    public static final String REPLACE_IP = "replace_ip";

    /**
     * Rule key: replace numbers.
     */
    public static final String REPLACE_IDENTIFIER = "replace_identifier";

    /**
     * Constants holder; no instances.
     */
    private DatasetCleanConstant() {
    }
}
......@@ -22,6 +22,11 @@ public class DatasetConstant {
*/
public static final Integer UNMARK = 0;
/**
* 默认单次写入10000条数据
*/
public static final Integer MAX_SIZE = 10000;
/**
* 已完成状态
*/
......@@ -57,6 +62,11 @@ public class DatasetConstant {
*/
public static final String OUTPUT = "output";
/**
* data数据
*/
public static final String DATA = "data";
/**
* 已标记
*/
......@@ -65,7 +75,17 @@ public class DatasetConstant {
/**
* 清洗中
*/
public static final Integer CLEAN_PROGRESS = 1;
public static final Integer CLEAN_PROGRESS = 0;
/**
* 清洗完成
*/
public static final Integer CLEAN_FINISHED = 1;
/**
* 文本数据清洗
*/
public static final Integer CLEAN_TYPE = 1;
/**
* 分页个数
......
......@@ -10,10 +10,8 @@ import com.yice.common.core.util.MyModelUtil;
import com.yice.common.core.util.MyPageUtil;
import com.yice.common.log.annotation.OperationLog;
import com.yice.common.log.model.constant.SysOperationLogType;
import com.yice.webadmin.app.dto.DatasetCleanConfigDto;
import com.yice.webadmin.app.dto.DatasetCleanDto;
import com.yice.webadmin.app.model.DatasetClean;
import com.yice.webadmin.app.model.DatasetCleanConfig;
import com.yice.webadmin.app.service.DatasetCleanService;
import com.yice.webadmin.app.vo.DatasetCleanVo;
import io.swagger.annotations.Api;
......@@ -63,21 +61,35 @@ public class DatasetCleanController {
* @param datasetCleanDto 新增对象。
* @return 应答结果对象,包含新增对象主键Id。
*/
@ApiOperationSupport(ignoreParameters = {"datasetCleanDto.cleanId","datasetCleanConfigDto.cleanConfigId"})
@ApiOperationSupport(ignoreParameters = {"datasetCleanDto.cleanId"})
@OperationLog(type = SysOperationLogType.ADD_ALL)
@PostMapping("/addAll")
public ResponseResult<DatasetClean> addAll(@MyRequestBody DatasetCleanDto datasetCleanDto,
@MyRequestBody DatasetCleanConfigDto datasetCleanConfigDto) {
@PostMapping("/startClean")
public ResponseResult<DatasetClean> startClean(@MyRequestBody DatasetCleanDto datasetCleanDto) {
String errorMessage = MyCommonUtil.getModelValidationError(datasetCleanDto, false);
if (errorMessage != null) {
return ResponseResult.error(ErrorCodeEnum.DATA_VALIDATED_FAILED, errorMessage);
}
DatasetClean datasetClean = MyModelUtil.copyTo(datasetCleanDto, DatasetClean.class);
DatasetCleanConfig datasetCleanConfig = MyModelUtil.copyTo(datasetCleanConfigDto, DatasetCleanConfig.class);
datasetClean = datasetCleanService.saveNew(datasetClean,datasetCleanConfig);
datasetClean = datasetCleanService.addNew(datasetClean);
return ResponseResult.success(datasetClean);
}
/**
 * Stops a dataset-cleaning task by delegating to {@code datasetCleanService.stopCleanTask}.
 *
 * @param cleanId primary key of the cleaning task to stop.
 * @return an {@code ARGUMENT_NULL_EXIST} error result when {@code cleanId} is blank,
 *         otherwise an empty success result.
 */
@OperationLog(type = SysOperationLogType.DELETE)
@PostMapping("/stopClean")
public ResponseResult<Void> stopClean(@RequestParam Long cleanId) {
    boolean idMissing = MyCommonUtil.existBlankArgument(cleanId);
    if (!idMissing) {
        // Argument is present: hand the task over to the service and report success.
        datasetCleanService.stopCleanTask(cleanId);
        return ResponseResult.success();
    }
    return ResponseResult.error(ErrorCodeEnum.ARGUMENT_NULL_EXIST);
}
/**
* 更新数据集清洗数据。
*
......
......@@ -356,7 +356,7 @@ public class DatasetVersionController {
return ResponseResult.error(ErrorCodeEnum.ARGUMENT_NULL_EXIST, errorMessage);
}
DatasetVersion datasetVersion = this.datasetVersionService.getById(versionId);
String versionName = datasetVersion.getVersionName();
String versionName = datasetVersion.getVersionName() + "_V" + datasetVersion.getDatasetVersion();
//先存储文件
String fullName = this.saveDatasetFile(importFile, versionName, versionId);
//再存储数据集配置文件
......
......@@ -24,8 +24,11 @@ public class DatasetDataDeduplicate {
@ApiModelProperty(name = "clean_id",value = "清洗任务标识id")
private Long cleanId;
@ApiModelProperty(name = "content",value = "去重内容")
private String content;
@ApiModelProperty(name = "clean_before_data",value = "清洗前数据")
private String cleanBeforeData;
@ApiModelProperty(name = "clean_after_data",value="清洗后数据")
private String cleanAfterData;
@ApiModelProperty(name = "create_time",value="创建时间")
private Date createTime;
......
......@@ -24,8 +24,11 @@ public class DatasetDataDesensitive {
@ApiModelProperty(name = "clean_id",value = "清洗任务标识id")
private Long cleanId;
@ApiModelProperty(name = "content",value = "去隐私内容")
private String content;
@ApiModelProperty(name = "clean_before_data",value = "清洗前数据")
private String cleanBeforeData;
@ApiModelProperty(name = "clean_after_data",value="清洗后数据")
private String cleanAfterData;
@ApiModelProperty(name = "create_time",value="创建时间")
private Date createTime;
......
package com.yice.webadmin.app.dto;
import com.yice.webadmin.app.model.DatasetCleanConfig;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.Data;
......@@ -36,6 +37,12 @@ public class DatasetCleanDto {
@ApiModelProperty(value = "清洗数据集名称")
private String datasetName;
/**
* 清洗配置对象。
*/
@ApiModelProperty(value = "清洗配置对象")
private DatasetCleanConfig config;
/**
* 清洗方式。
*/
......
......@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateDto {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
......@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveDto {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
......@@ -80,6 +80,12 @@ public class DatasetClean extends BaseModel {
@TableField(exist = false)
private List<String> cleanMethod;
/**
* 清洗配置对象。
*/
@TableField(exist = false)
private DatasetCleanConfig config;
/**
* 创建人名称字典。
*/
......
......@@ -2,7 +2,6 @@ package com.yice.webadmin.app.service;
import com.yice.common.core.base.service.IBaseService;
import com.yice.webadmin.app.model.DatasetClean;
import com.yice.webadmin.app.model.DatasetCleanConfig;
import java.util.List;
......@@ -22,14 +21,19 @@ public interface DatasetCleanService extends IBaseService<DatasetClean, Long> {
*/
DatasetClean saveNew(DatasetClean datasetClean);
/**
 * Stops a cleaning task.
 *
 * @param cleanId id of the cleaning task.
 */
void stopCleanTask(Long cleanId);
/**
* 保存清洗对象以及清洗配置对象。
*
* @param datasetClean 新增对象。
* @param datasetCleanConfig 新增配置对象。
* @return 返回新增对象。
*/
DatasetClean saveNew(DatasetClean datasetClean, DatasetCleanConfig datasetCleanConfig);
DatasetClean addNew(DatasetClean datasetClean);
/**
* 利用数据库的insertList语法,批量插入对象列表。
......
......@@ -78,6 +78,15 @@ public interface DatasetDataService {
*/
void update(DatasetData datasetData);
/**
 * Processes a list of dataset records in one batch.
 *
 * @param dataList  the dataset records to process.
 * @param versionId the version identifier.
 */
void updateBatch(List<DatasetData> dataList, Long versionId);
/**
* 删除指定数据。
*
......
......@@ -264,7 +264,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List<Document> documents = new ArrayList<>();
if(CollUtil.isNotEmpty(deduplicates)) {
for(DatasetDataDeduplicate deduplicate : deduplicates) {
Document document = new Document(MongoConstant.CONTENT, deduplicate.getContent())
Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, deduplicate.getCleanBeforeData())
.append(MongoConstant.CLEAN_AFTER_DATA, deduplicate.getCleanAfterData())
.append(MongoConstant.CLEAN_ID, deduplicate.getCleanId())
.append(MongoConstant.CREATE_TIME, new Date());
documents.add(document);
......@@ -283,9 +284,10 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
public void saveDatasetDesensitive(List<DatasetDataDesensitive> desensitives) {
List<Document> documents = new ArrayList<>();
if(CollUtil.isNotEmpty(desensitives)) {
for(DatasetDataDesensitive dataDesensitive : desensitives) {
Document document = new Document(MongoConstant.CONTENT, dataDesensitive.getContent())
.append(MongoConstant.CLEAN_ID, dataDesensitive.getCleanId())
for(DatasetDataDesensitive desensitive : desensitives) {
Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, desensitive.getCleanBeforeData())
.append(MongoConstant.CLEAN_AFTER_DATA, desensitive.getCleanAfterData())
.append(MongoConstant.CLEAN_ID, desensitive.getCleanId())
.append(MongoConstant.CREATE_TIME, new Date());
documents.add(document);
}
......
......@@ -148,6 +148,18 @@ public class DatasetDataServiceImpl implements DatasetDataService {
MongoConstant.COLLECT_NAME + datasetData.getVersionId());
}
/**
 * Saves every record of {@code dataList} into the collection of the given version.
 *
 * <p>The target collection is {@code MongoConstant.COLLECT_NAME + versionId}. Records are saved
 * one by one because {@code mongoTemplate.save} persists a single object per call; passing the
 * whole list would not store its elements as individual documents.
 *
 * @param dataList  records to save; a {@code null} or empty list is a no-op.
 * @param versionId version identifier used to derive the collection name.
 */
@Override
public void updateBatch(List<DatasetData> dataList, Long versionId) {
    if (dataList == null || dataList.isEmpty()) {
        return;
    }
    String collectionName = MongoConstant.COLLECT_NAME + versionId;
    for (DatasetData datasetData : dataList) {
        mongoTemplate.save(datasetData, collectionName);
    }
}
/**
* 删除指定数据。
*
......
package com.yice.webadmin.app.util;
import com.github.houbb.opencc4j.util.ZhConverterUtil;
import com.yice.webadmin.app.constant.DatasetCleanConstant;
import lombok.extern.slf4j.Slf4j;

import java.util.List;
import java.util.regex.Pattern;
/**
 * Applies the configured text-cleaning rules to a piece of data.
 *
 * <p>All regular expressions are compiled once and reused across calls.
 */
@Slf4j
public class DataCleanerUtil {

    /**
     * Any character of Unicode general category C.
     */
    private static final Pattern INVISIBLE_CHARACTERS = Pattern.compile("[\\p{C}]");

    /**
     * Unicode separators (category Z) plus the explicitly listed space and line-separator code points.
     */
    private static final Pattern UNICODE_WHITESPACE =
            Pattern.compile("[\\p{Z}\\u2000-\\u200A\\u2028\\u2029\\u3000]");

    /**
     * Lone surrogates, private-use and unassigned code points.
     */
    private static final Pattern NON_MEANING_CHARACTERS = Pattern.compile("[\\p{Cs}\\p{Co}\\p{Cn}]");

    /**
     * Anything that looks like a markup tag: {@code <...>}.
     */
    private static final Pattern WEB_IDENTIFIERS = Pattern.compile("<[^>]*>");

    /**
     * Emoji code point ranges U+1F3FB-1F3FF, U+1F400-1F64F, U+1F680-1F6FF and U+1F910-1F9FF.
     *
     * <p>These are the ranges the previous surrogate-pair expression spelled out. They are written
     * as code points because Java matches supplementary characters by code point; the previous
     * form was a character class of lone surrogates plus a literal {@code |}, so it removed pipe
     * characters and never matched a real emoji.
     */
    private static final Pattern EMOJI = Pattern.compile(
            "[\\x{1F3FB}-\\x{1F3FF}\\x{1F400}-\\x{1F64F}\\x{1F680}-\\x{1F6FF}\\x{1F910}-\\x{1F9FF}]");

    /**
     * E-mail address; the top-level domain is letters only (no literal {@code |}).
     */
    private static final Pattern EMAIL =
            Pattern.compile("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b");

    /**
     * Dotted-quad IPv4 address with each octet in 0-255.
     */
    private static final Pattern IP_ADDRESS = Pattern.compile(
            "\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b");

    /**
     * Any run of digits.
     */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Static utility; no instances.
     */
    private DataCleanerUtil() {
    }

    /**
     * Builds the cleaned form of {@code data} by applying {@code rules} in list order.
     *
     * <p>Rule keys are the constants of {@link DatasetCleanConstant}. Keys that are {@code null}
     * or not handled here are skipped.
     *
     * @param data  text to clean; {@code null} is returned unchanged.
     * @param rules rule keys to apply, in order; {@code null} means "no rules".
     * @return the text after all recognised rules have been applied.
     */
    public static String buildCleanAfterData(String data, List<String> rules) {
        if (data == null || rules == null) {
            return data;
        }
        String cleaned = data;
        for (String rule : rules) {
            if (rule == null) {
                // switch on a null String would throw a NullPointerException.
                continue;
            }
            switch (rule) {
                case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
                    // NOTE(review): the rule is named "remove", yet matches are replaced by the
                    // literal text "&nbsp;" - kept as-is; confirm whether this is intended.
                    cleaned = INVISIBLE_CHARACTERS.matcher(cleaned).replaceAll("&nbsp;");
                    break;
                case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
                    // Normalise every Unicode space variant to a plain ASCII space.
                    cleaned = UNICODE_WHITESPACE.matcher(cleaned).replaceAll(" ");
                    break;
                case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
                    cleaned = NON_MEANING_CHARACTERS.matcher(cleaned).replaceAll("");
                    break;
                case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
                    cleaned = ZhConverterUtil.toSimple(cleaned);
                    break;
                case DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS:
                    cleaned = WEB_IDENTIFIERS.matcher(cleaned).replaceAll("");
                    break;
                case DatasetCleanConstant.REMOVE_EMOJI:
                    cleaned = EMOJI.matcher(cleaned).replaceAll("");
                    break;
                case DatasetCleanConstant.REPLACE_EMAILS:
                    cleaned = EMAIL.matcher(cleaned).replaceAll("EMAIL");
                    break;
                case DatasetCleanConstant.REPLACE_IP:
                    cleaned = IP_ADDRESS.matcher(cleaned).replaceAll("IP_ADDRESS");
                    break;
                case DatasetCleanConstant.REPLACE_IDENTIFIER:
                    cleaned = DIGITS.matcher(cleaned).replaceAll("PI:KEY");
                    break;
                default:
                    // Rule keys not handled by this utility are ignored.
                    break;
            }
        }
        return cleaned;
    }
}
......@@ -38,4 +38,28 @@ public class JsonNameExtractor {
return names;
}
/**
 * Extracts the {@code name} values from a JSON array of objects.
 *
 * <p>Only array elements that carry a textual {@code name} field contribute to the result.
 * A {@code null} or blank input, a JSON document that is not an array, or text that cannot be
 * parsed yields an empty list (parse failures are logged).
 *
 * @param rule JSON text, e.g. {@code [{"name":"remove_emoji"}]}.
 * @return the names in array order; never {@code null}.
 */
public static List<String> extractNames(String rule) {
    List<String> names = new ArrayList<>();
    if (rule == null || rule.trim().isEmpty()) {
        // readTree rejects null content with an unchecked exception; there is nothing to parse anyway.
        return names;
    }
    ObjectMapper mapper = new ObjectMapper();
    try {
        JsonNode rootNode = mapper.readTree(rule);
        if (rootNode != null && rootNode.isArray()) {
            for (JsonNode jsonNode : rootNode) {
                JsonNode nameNode = jsonNode.get("name");
                if (nameNode != null && nameNode.isTextual()) {
                    names.add(nameNode.asText());
                }
            }
        }
    } catch (IOException e) {
        log.error("extract name method overload is error", e);
    }
    return names;
}
}
package com.yice.webadmin.app.vo;
import com.yice.webadmin.app.model.DatasetCleanConfig;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.Data;
......@@ -42,6 +43,12 @@ public class DatasetCleanVo {
@ApiModelProperty(value = "清洗数据集名称")
private String datasetName;
/**
* 清洗配置对象。
*/
@ApiModelProperty(value = "清洗配置对象")
private DatasetCleanConfig config;
/**
* 开始时间。
*/
......
......@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateVo {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
......@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveVo {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment