Commit 7f447ff0 authored by pengxin's avatar pengxin

数据集清洗新增开关模块。

parent cc809ff6
...@@ -70,6 +70,11 @@ ...@@ -70,6 +70,11 @@
<artifactId>opencc4j</artifactId> <artifactId>opencc4j</artifactId>
<version>1.6.2</version> <version>1.6.2</version>
</dependency> </dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
</dependencies> </dependencies>
<build> <build>
......
...@@ -50,4 +50,29 @@ public class DatasetCleanConstant { ...@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
*/ */
public static final String REPLACE_IDENTIFIER = "replace_identifier"; public static final String REPLACE_IDENTIFIER = "replace_identifier";
/**
* 检查文档的词数目
*/
public static final String FILTER_CHECK_NUMBER_WORDS = "filter_check_number_words";
/**
* 检查文档的词重复率
*/
public static final String FILTER_CHECK_WORD_REPETITION_REMOVAL = "filter_check_word_repetition_removal";
/**
* 检查文档的字重复率
*/
public static final String FILTER_CHECK_CHARACTER_REPETITION_REMOVAL = "filter_check_character_repetition_removal";
/**
* 检查文档的特殊字符率
*/
public static final String FILTER_CHECK_SPECIAL_CHARACTERS = "filter_check_special_characters";
/**
* 检查文档的色情暴力词率
*/
public static final String FILTER_CHECK_FLAGGED_WORDS = "filter_check_flagged_words";
} }
...@@ -7,6 +7,11 @@ public class DatasetConstant { ...@@ -7,6 +7,11 @@ public class DatasetConstant {
*/ */
public static final Integer STATUS_UNPUBLISHED = 0; public static final Integer STATUS_UNPUBLISHED = 0;
/**
* 已导入
*/
public static final Integer INPUT_STATUS = 1;
/** /**
* 已发布状态 * 已发布状态
*/ */
...@@ -62,6 +67,21 @@ public class DatasetConstant { ...@@ -62,6 +67,21 @@ public class DatasetConstant {
*/ */
public static final String OUTPUT = "output"; public static final String OUTPUT = "output";
/**
* args参数值
*/
public static final String ARGS = "args";
/**
* name参数值
*/
public static final String NAME = "name";
/**
* 关状态
*/
public static final String CLOSED = "0";
/** /**
* data数据 * data数据
*/ */
...@@ -82,6 +102,16 @@ public class DatasetConstant { ...@@ -82,6 +102,16 @@ public class DatasetConstant {
*/ */
public static final Integer CLEAN_FINISHED = 1; public static final Integer CLEAN_FINISHED = 1;
/**
* 暂停清洗
*/
public static final Integer PAUSE_FINISHED = 2;
/**
* 空字符串
*/
public static final String EMPTY_STR = "";
/** /**
* 文本数据清洗 * 文本数据清洗
*/ */
......
...@@ -362,7 +362,7 @@ public class DatasetVersionController { ...@@ -362,7 +362,7 @@ public class DatasetVersionController {
//再存储数据集配置文件 //再存储数据集配置文件
datasetVersionService.saveDatasetInfo(versionName); datasetVersionService.saveDatasetInfo(versionName);
datasetVersion.setFileUrl(fullName); datasetVersion.setFileUrl(fullName);
datasetVersion.setInputStatus(1); datasetVersion.setInputStatus(DatasetConstant.INPUT_STATUS);
datasetVersion.setDataVolume(Long.valueOf(JSON.parseArray(new String(importFile.getBytes(), StandardCharsets.UTF_8)).size())); datasetVersion.setDataVolume(Long.valueOf(JSON.parseArray(new String(importFile.getBytes(), StandardCharsets.UTF_8)).size()));
this.datasetVersionService.updateById(datasetVersion); this.datasetVersionService.updateById(datasetVersion);
return ResponseResult.success(); return ResponseResult.success();
......
...@@ -27,11 +27,17 @@ public class DatasetDataFilter { ...@@ -27,11 +27,17 @@ public class DatasetDataFilter {
@Field("clean_id") @Field("clean_id")
private Long cleanId; private Long cleanId;
/**
* 清洗前数据
*/
@Field("clean_before_data")
private String cleanBeforeData;
/** /**
* 清洗后数据 * 清洗后数据
*/ */
@Field("content") @Field("clean_after_data")
private String content; private String cleanAfterData;
/** /**
* 创建时间 * 创建时间
......
package com.yice.webadmin.app.data;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Simple value holder describing one dataset rule: a rule name plus a
 * single numeric argument.
 *
 * <p>Accessors and constructors are generated by Lombok; only
 * {@link #toString()} is written by hand.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DatasetRule {

    /**
     * Name of the rule.
     */
    private String name;

    /**
     * Numeric argument associated with the rule.
     */
    private double args;

    /**
     * Renders the rule as {@code DatasetRule{args=<args>, name='<name>'}}.
     *
     * @return textual representation of this rule
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DatasetRule{");
        text.append("args=").append(args);
        text.append(", name='").append(name).append('\'');
        text.append('}');
        return text.toString();
    }
}
...@@ -64,7 +64,7 @@ public class DatasetCleanDto { ...@@ -64,7 +64,7 @@ public class DatasetCleanDto {
/** /**
* 清洗状态。 * 清洗状态。
*/ */
@ApiModelProperty(value = "清洗状态:0:进行中;1:已完成") @ApiModelProperty(value = "清洗状态;0:未清洗;1:已清洗;2:暂停清洗")
private Integer cleanStatus; private Integer cleanStatus;
/** /**
......
...@@ -29,10 +29,16 @@ public class DatasetDataFilterDto { ...@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
private Long cleanId; private Long cleanId;
/** /**
* 过滤内容 * 清洗前数据
*/ */
@ApiModelProperty(value = "过滤内容") @ApiModelProperty(value = "清洗前数据")
private String content; private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/** /**
* 创建时间。 * 创建时间。
......
...@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi ...@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List<Document> documents = new ArrayList<>(); List<Document> documents = new ArrayList<>();
if(CollUtil.isNotEmpty(filters)) { if(CollUtil.isNotEmpty(filters)) {
for(DatasetDataFilter filter : filters) { for(DatasetDataFilter filter : filters) {
Document document = new Document(MongoConstant.CONTENT, filter.getContent()) Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, filter.getCleanBeforeData())
.append(MongoConstant.CLEAN_AFTER_DATA, filter.getCleanAfterData())
.append(MongoConstant.CLEAN_ID, filter.getCleanId()) .append(MongoConstant.CLEAN_ID, filter.getCleanId())
.append(MongoConstant.CREATE_TIME, new Date()); .append(MongoConstant.CREATE_TIME, new Date());
documents.add(document); documents.add(document);
......
...@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService { ...@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
public void updateBatch(List<DatasetData> dataList, Long versionId) { public void updateBatch(List<DatasetData> dataList, Long versionId) {
if(CollUtil.isNotEmpty(dataList)) { if(CollUtil.isNotEmpty(dataList)) {
for(DatasetData datasetData : dataList) { for(DatasetData datasetData : dataList) {
// 解析data字段的字符串为Document或Bson // 解析data字段的字符串为Document或Bson
Document dataDocument = Document.parse(datasetData.getData()); Document dataDocument = Document.parse(datasetData.getData());
......
...@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util; ...@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.yice.webadmin.app.constant.DatasetConstant;
import com.yice.webadmin.app.data.DatasetRule;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import java.io.IOException; import java.io.IOException;
...@@ -62,4 +64,24 @@ public class JsonNameExtractor { ...@@ -62,4 +64,24 @@ public class JsonNameExtractor {
return names; return names;
} }
/**
 * Builds a {@link DatasetRule} from a JSON rule description.
 *
 * <p>The JSON object is read for two entries: {@code DatasetConstant.ARGS}
 * (the numeric argument, accepted either as a JSON number or as a numeric
 * string) and {@code DatasetConstant.NAME} (the rule name).
 *
 * <p>This method is best-effort: a null/blank input, malformed JSON, a
 * missing entry or a non-numeric argument is logged and leaves the
 * corresponding field of the returned rule at its default value instead of
 * throwing.
 *
 * @param rule JSON text describing the rule; may be null or blank
 * @return a rule populated with whatever could be read; never null
 */
public static DatasetRule buildRuleData(String rule) {
    DatasetRule datasetRule = new DatasetRule();
    // readTree rejects a null argument, so bail out before parsing.
    if (rule == null || rule.trim().isEmpty()) {
        log.warn("dataset rule is empty, returning an unpopulated DatasetRule");
        return datasetRule;
    }
    try {
        ObjectMapper objectMapper = new ObjectMapper();
        JsonNode config = objectMapper.readTree(rule);
        if (config == null) {
            return datasetRule;
        }

        // path() yields a "missing" node instead of null when the key is absent,
        // so the checks below never dereference null.
        JsonNode argsNode = config.path(DatasetConstant.ARGS);
        if (argsNode.isNumber()) {
            datasetRule.setArgs(argsNode.doubleValue());
        } else if (argsNode.isTextual()) {
            try {
                datasetRule.setArgs(Double.parseDouble(argsNode.textValue()));
            } catch (NumberFormatException e) {
                log.error("dataset rule args is not a number: {}", argsNode.textValue(), e);
            }
        } else {
            log.warn("dataset rule has no usable '{}' entry: {}", DatasetConstant.ARGS, rule);
        }

        // textValue() is null for a missing or non-text node, matching the unset default.
        datasetRule.setName(config.path(DatasetConstant.NAME).textValue());
    } catch (IOException e) {
        log.error("failed to parse dataset rule json: {}", rule, e);
    }
    return datasetRule;
}
} }
...@@ -64,7 +64,7 @@ public class DatasetCleanVo { ...@@ -64,7 +64,7 @@ public class DatasetCleanVo {
/** /**
* 清洗状态。 * 清洗状态。
*/ */
@ApiModelProperty(value = "清洗状态") @ApiModelProperty(value = "清洗状态;0:未清洗;1:已清洗;2:暂停清洗")
private Integer cleanStatus; private Integer cleanStatus;
/** /**
......
...@@ -29,10 +29,16 @@ public class DatasetDataFilterVo { ...@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
private Long cleanId; private Long cleanId;
/** /**
* 过滤内容 * 清洗前数据
*/ */
@ApiModelProperty(value = "过滤内容") @ApiModelProperty(value = "清洗前数据")
private String content; private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/** /**
* 创建时间。 * 创建时间。
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment