Commit 7f447ff0 authored by pengxin's avatar pengxin

数据集清洗新增开关模块。

parent cc809ff6
......@@ -70,6 +70,11 @@
<artifactId>opencc4j</artifactId>
<version>1.6.2</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
</dependencies>
<build>
......
......@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
*/
public static final String REPLACE_IDENTIFIER = "replace_identifier";
/**
* 检查文档的词数目
*/
public static final String FILTER_CHECK_NUMBER_WORDS = "filter_check_number_words";
/**
* Checks the word repetition ratio of the document
*/
public static final String FILTER_CHECK_WORD_REPETITION_REMOVAL = "filter_check_word_repetition_removal";
/**
* Checks the character repetition ratio of the document
*/
public static final String FILTER_CHECK_CHARACTER_REPETITION_REMOVAL = "filter_check_character_repetition_removal";
/**
* 检查文档的特殊字符率
*/
public static final String FILTER_CHECK_SPECIAL_CHARACTERS = "filter_check_special_characters";
/**
* 检查文档的色情暴力词率
*/
public static final String FILTER_CHECK_FLAGGED_WORDS = "filter_check_flagged_words";
}
......@@ -7,6 +7,11 @@ public class DatasetConstant {
*/
public static final Integer STATUS_UNPUBLISHED = 0;
/**
* 已导入
*/
public static final Integer INPUT_STATUS = 1;
/**
* 已发布状态
*/
......@@ -62,6 +67,21 @@ public class DatasetConstant {
*/
public static final String OUTPUT = "output";
/**
* args参数值
*/
public static final String ARGS = "args";
/**
* JSON key under which a rule's name is stored
*/
public static final String NAME = "name";
/**
* 关状态
*/
public static final String CLOSED = "0";
/**
* data数据
*/
......@@ -82,6 +102,16 @@ public class DatasetConstant {
*/
public static final Integer CLEAN_FINISHED = 1;
/**
* 暂停清洗
*/
public static final Integer PAUSE_FINISHED = 2;
/**
* Empty string
*/
public static final String EMPTY_STR = "";
/**
* 文本数据清洗
*/
......
......@@ -362,7 +362,7 @@ public class DatasetVersionController {
//再存储数据集配置文件
datasetVersionService.saveDatasetInfo(versionName);
datasetVersion.setFileUrl(fullName);
datasetVersion.setInputStatus(1);
datasetVersion.setInputStatus(DatasetConstant.INPUT_STATUS);
datasetVersion.setDataVolume(Long.valueOf(JSON.parseArray(new String(importFile.getBytes(), StandardCharsets.UTF_8)).size()));
this.datasetVersionService.updateById(datasetVersion);
return ResponseResult.success();
......
......@@ -27,11 +27,17 @@ public class DatasetDataFilter {
@Field("clean_id")
private Long cleanId;
/**
* 清洗前数据
*/
@Field("clean_before_data")
private String cleanBeforeData;
/**
* 清洗后数据
*/
@Field("content")
private String content;
@Field("clean_after_data")
private String cleanAfterData;
/**
* 创建时间
......
package com.yice.webadmin.app.data;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DatasetRule {

    /**
     * Rule name.
     */
    private String name;

    /**
     * Numeric argument configured for the rule.
     */
    private double args;

    /**
     * Renders the rule as {@code DatasetRule{args=<args>, name='<name>'}}.
     *
     * @return textual form of this rule
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DatasetRule{");
        text.append("args=").append(args);
        text.append(", name='").append(name).append('\'');
        text.append('}');
        return text.toString();
    }
}
......@@ -64,7 +64,7 @@ public class DatasetCleanDto {
/**
* 清洗状态。
*/
@ApiModelProperty(value = "清洗状态:0:进行中;1:已完成")
@ApiModelProperty(value = "清洗状态;0:未清洗;1:已清洗;2:暂停清洗")
private Integer cleanStatus;
/**
......
......@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
......@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List<Document> documents = new ArrayList<>();
if(CollUtil.isNotEmpty(filters)) {
for(DatasetDataFilter filter : filters) {
Document document = new Document(MongoConstant.CONTENT, filter.getContent())
Document document = new Document(MongoConstant.CLEAN_BEFORE_DATA, filter.getCleanBeforeData())
.append(MongoConstant.CLEAN_AFTER_DATA, filter.getCleanAfterData())
.append(MongoConstant.CLEAN_ID, filter.getCleanId())
.append(MongoConstant.CREATE_TIME, new Date());
documents.add(document);
......
......@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
public void updateBatch(List<DatasetData> dataList, Long versionId) {
if(CollUtil.isNotEmpty(dataList)) {
for(DatasetData datasetData : dataList) {
// 解析data字段的字符串为Document或Bson
Document dataDocument = Document.parse(datasetData.getData());
......
......@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yice.webadmin.app.constant.DatasetConstant;
import com.yice.webadmin.app.data.DatasetRule;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
......@@ -62,4 +64,24 @@ public class JsonNameExtractor {
return names;
}
/**
 * Builds a {@link DatasetRule} from the JSON description of a cleaning rule.
 *
 * <p>The {@code DatasetConstant.ARGS} field may be a JSON number or a numeric
 * string; the {@code DatasetConstant.NAME} field is read when it is a JSON string.
 * A field that is missing or of another type leaves the corresponding property
 * at its default. Parsing is best-effort: a failure is logged and whatever was
 * populated so far is returned instead of propagating the exception.
 *
 * @param rule JSON text of the rule; may be null or blank
 * @return the rule built from {@code rule}, never null
 */
public static DatasetRule buildRuleData(String rule) {
    DatasetRule datasetRule = new DatasetRule();
    if (rule == null || rule.trim().isEmpty()) {
        // Nothing to parse: keep the default rule rather than failing inside Jackson.
        return datasetRule;
    }
    try {
        JsonNode config = new ObjectMapper().readTree(rule);
        if (config == null) {
            return datasetRule;
        }

        // path() never returns null, so a missing key cannot trigger an NPE.
        JsonNode argsNode = config.path(DatasetConstant.ARGS);
        if (argsNode.isNumber()) {
            // textValue() would be null for a numeric node, so read it as a number.
            datasetRule.setArgs(argsNode.doubleValue());
        } else if (argsNode.isTextual()) {
            datasetRule.setArgs(Double.parseDouble(argsNode.textValue()));
        }

        JsonNode nameNode = config.path(DatasetConstant.NAME);
        if (nameNode.isTextual()) {
            datasetRule.setName(nameNode.textValue());
        }
    } catch (IOException | NumberFormatException e) {
        log.error("build rule data failed, rule={}", rule, e);
    }
    return datasetRule;
}
}
......@@ -64,7 +64,7 @@ public class DatasetCleanVo {
/**
* 清洗状态。
*/
@ApiModelProperty(value = "清洗状态")
@ApiModelProperty(value = "清洗状态;0:未清洗;1:已清洗;2:暂停清洗")
private Integer cleanStatus;
/**
......
......@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
private Long cleanId;
/**
* 过滤内容
* 清洗前数据
*/
@ApiModelProperty(value = "过滤内容")
private String content;
@ApiModelProperty(value = "清洗前数据")
private String cleanBeforeData;
/**
* 清洗后数据。
*/
@ApiModelProperty(value="清洗后数据")
private String cleanAfterData;
/**
* 创建时间。
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment