Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
lmp_server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lmp
lmp_server
Commits
7f447ff0
Commit
7f447ff0
authored
Apr 08, 2024
by
pengxin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
数据集清洗新增开关模块。
parent
cc809ff6
Changes
15
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
523 additions
and
85 deletions
+523
-85
pom.xml
application-webadmin/pom.xml
+5
-0
DatasetCleanConstant.java
.../com/yice/webadmin/app/constant/DatasetCleanConstant.java
+25
-0
DatasetConstant.java
.../java/com/yice/webadmin/app/constant/DatasetConstant.java
+30
-0
DatasetVersionController.java
...ice/webadmin/app/controller/DatasetVersionController.java
+1
-1
DatasetDataFilter.java
...in/java/com/yice/webadmin/app/data/DatasetDataFilter.java
+8
-2
DatasetRule.java
...src/main/java/com/yice/webadmin/app/data/DatasetRule.java
+30
-0
DatasetCleanDto.java
.../main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
+1
-1
DatasetDataFilterDto.java
.../java/com/yice/webadmin/app/dto/DatasetDataFilterDto.java
+9
-3
DatasetCleanConfigServiceImpl.java
...admin/app/service/impl/DatasetCleanConfigServiceImpl.java
+2
-1
DatasetCleanServiceImpl.java
...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java
+108
-42
DatasetDataServiceImpl.java
...ice/webadmin/app/service/impl/DatasetDataServiceImpl.java
+1
-0
DataCleanerUtil.java
...main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
+271
-31
JsonNameExtractor.java
...in/java/com/yice/webadmin/app/util/JsonNameExtractor.java
+22
-0
DatasetCleanVo.java
...rc/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
+1
-1
DatasetDataFilterVo.java
...in/java/com/yice/webadmin/app/vo/DatasetDataFilterVo.java
+9
-3
No files found.
application-webadmin/pom.xml
View file @
7f447ff0
...
...
@@ -70,6 +70,11 @@
<artifactId>
opencc4j
</artifactId>
<version>
1.6.2
</version>
</dependency>
<dependency>
<groupId>
com.hankcs
</groupId>
<artifactId>
hanlp
</artifactId>
<version>
portable-1.8.2
</version>
</dependency>
</dependencies>
<build>
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetCleanConstant.java
View file @
7f447ff0
...
...
@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
*/
public
static
final
String
REPLACE_IDENTIFIER
=
"replace_identifier"
;
/**
* 检查文档的词数目
*/
public
static
final
String
FILTER_CHECK_NUMBER_WORDS
=
"filter_check_number_words"
;
/**
 * 检查文档的词重复率
*/
public
static
final
String
FILTER_CHECK_WORD_REPETITION_REMOVAL
=
"filter_check_word_repetition_removal"
;
/**
 * 检查文档的字重复率
*/
public
static
final
String
FILTER_CHECK_CHARACTER_REPETITION_REMOVAL
=
"filter_check_character_repetition_removal"
;
/**
* 检查文档的特殊字符率
*/
public
static
final
String
FILTER_CHECK_SPECIAL_CHARACTERS
=
"filter_check_special_characters"
;
/**
* 检查文档的色情暴力词率
*/
public
static
final
String
FILTER_CHECK_FLAGGED_WORDS
=
"filter_check_flagged_words"
;
}
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetConstant.java
View file @
7f447ff0
...
...
@@ -7,6 +7,11 @@ public class DatasetConstant {
*/
public
static
final
Integer
STATUS_UNPUBLISHED
=
0
;
/**
* 已导入
*/
public
static
final
Integer
INPUT_STATUS
=
1
;
/**
* 已发布状态
*/
...
...
@@ -62,6 +67,21 @@ public class DatasetConstant {
*/
public
static
final
String
OUTPUT
=
"output"
;
/**
* args参数值
*/
public
static
final
String
ARGS
=
"args"
;
/**
 * name参数值
*/
public
static
final
String
NAME
=
"name"
;
/**
* 关状态
*/
public
static
final
String
CLOSED
=
"0"
;
/**
* data数据
*/
...
...
@@ -82,6 +102,16 @@ public class DatasetConstant {
*/
public
static
final
Integer
CLEAN_FINISHED
=
1
;
/**
* 暂停清洗
*/
public
static
final
Integer
PAUSE_FINISHED
=
2
;
/**
* 空白字符
*/
public
static
final
String
EMPTY_STR
=
""
;
/**
* 文本数据清洗
*/
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/controller/DatasetVersionController.java
View file @
7f447ff0
...
...
@@ -362,7 +362,7 @@ public class DatasetVersionController {
//再存储数据集配置文件
datasetVersionService
.
saveDatasetInfo
(
versionName
);
datasetVersion
.
setFileUrl
(
fullName
);
datasetVersion
.
setInputStatus
(
1
);
datasetVersion
.
setInputStatus
(
DatasetConstant
.
INPUT_STATUS
);
datasetVersion
.
setDataVolume
(
Long
.
valueOf
(
JSON
.
parseArray
(
new
String
(
importFile
.
getBytes
(),
StandardCharsets
.
UTF_8
)).
size
()));
this
.
datasetVersionService
.
updateById
(
datasetVersion
);
return
ResponseResult
.
success
();
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataFilter.java
View file @
7f447ff0
...
...
@@ -27,11 +27,17 @@ public class DatasetDataFilter {
@Field
(
"clean_id"
)
private
Long
cleanId
;
/**
* 清洗前数据
*/
@Field
(
"clean_before_data"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据
*/
@Field
(
"c
ontent
"
)
private
String
c
ontent
;
@Field
(
"c
lean_after_data
"
)
private
String
c
leanAfterData
;
/**
* 创建时间
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetRule.java
0 → 100644
View file @
7f447ff0
package
com
.
yice
.
webadmin
.
app
.
data
;
import
lombok.AllArgsConstructor
;
import
lombok.Data
;
import
lombok.NoArgsConstructor
;
/**
 * Plain data holder for a single dataset rule: a rule name plus one numeric argument.
 *
 * <p>The Lombok annotations supply the accessors and the no-arg / all-args
 * constructors; {@link #toString()} is written by hand below.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DatasetRule {

    /**
     * Rule name.
     */
    private String name;

    /**
     * Numeric argument attached to the rule.
     * NOTE(review): the original comment labelled this field "version identifier",
     * which looks like a copy-paste leftover - confirm the intended meaning.
     */
    private double args;

    /**
     * Renders the rule as {@code DatasetRule{args=<args>, name='<name>'}}.
     *
     * @return the textual form of this rule, with {@code args} printed before {@code name}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DatasetRule{");
        text.append("args=").append(args);
        text.append(", name='").append(name).append('\'');
        text.append('}');
        return text.toString();
    }
}
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
View file @
7f447ff0
...
...
@@ -64,7 +64,7 @@ public class DatasetCleanDto {
/**
* 清洗状态。
*/
@ApiModelProperty
(
value
=
"清洗状态
:0:进行中;1:已完成
"
)
@ApiModelProperty
(
value
=
"清洗状态
;0:未清洗;1:已清洗;2:暂停清洗
"
)
private
Integer
cleanStatus
;
/**
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetDataFilterDto.java
View file @
7f447ff0
...
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
private
Long
cleanId
;
/**
*
过滤内容
。
*
清洗前数据
。
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
private
String
content
;
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
* 创建时间。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanConfigServiceImpl.java
View file @
7f447ff0
...
...
@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List
<
Document
>
documents
=
new
ArrayList
<>();
if
(
CollUtil
.
isNotEmpty
(
filters
))
{
for
(
DatasetDataFilter
filter
:
filters
)
{
Document
document
=
new
Document
(
MongoConstant
.
CONTENT
,
filter
.
getContent
())
Document
document
=
new
Document
(
MongoConstant
.
CLEAN_BEFORE_DATA
,
filter
.
getCleanBeforeData
())
.
append
(
MongoConstant
.
CLEAN_AFTER_DATA
,
filter
.
getCleanAfterData
())
.
append
(
MongoConstant
.
CLEAN_ID
,
filter
.
getCleanId
())
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
documents
.
add
(
document
);
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java
View file @
7f447ff0
This diff is collapsed.
Click to expand it.
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetDataServiceImpl.java
View file @
7f447ff0
...
...
@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
public
void
updateBatch
(
List
<
DatasetData
>
dataList
,
Long
versionId
)
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData
:
dataList
)
{
// 解析data字段的字符串为Document或Bson
Document
dataDocument
=
Document
.
parse
(
datasetData
.
getData
());
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
View file @
7f447ff0
This diff is collapsed.
Click to expand it.
application-webadmin/src/main/java/com/yice/webadmin/app/util/JsonNameExtractor.java
View file @
7f447ff0
...
...
@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
import
com.fasterxml.jackson.databind.JsonNode
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
com.yice.webadmin.app.constant.DatasetConstant
;
import
com.yice.webadmin.app.data.DatasetRule
;
import
lombok.extern.slf4j.Slf4j
;
import
java.io.IOException
;
...
...
@@ -62,4 +64,24 @@ public class JsonNameExtractor {
return
names
;
}
/**
 * Builds a {@link DatasetRule} from the JSON description of a rule.
 *
 * <p>The JSON is expected to carry an {@code args} field (numeric, or text holding a
 * number) and a {@code name} field. Parsing is best-effort: any part that is missing or
 * malformed is logged and left at its default ({@code 0.0} for args, {@code null} for
 * name) instead of raising an unchecked exception.
 *
 * @param rule JSON text describing the rule; may be {@code null} or empty
 * @return a rule populated with whatever could be read; never {@code null}
 */
public static DatasetRule buildRuleData(String rule) {
    DatasetRule datasetRule = new DatasetRule();
    if (rule == null || rule.isEmpty()) {
        log.warn("dataset rule json is empty, returning default rule");
        return datasetRule;
    }
    try {
        ObjectMapper objectMapper = new ObjectMapper();
        JsonNode config = objectMapper.readTree(rule);
        if (config == null) {
            // Some Jackson versions return null for content without any JSON token.
            log.warn("dataset rule json has no content: {}", rule);
            return datasetRule;
        }
        datasetRule.setArgs(readRuleArgs(config.get(DatasetConstant.ARGS), rule));
        JsonNode nameNode = config.get(DatasetConstant.NAME);
        if (nameNode != null) {
            // textValue() is null for non-text nodes, matching the original behaviour.
            datasetRule.setName(nameNode.textValue());
        }
    } catch (IOException e) {
        log.error("failed to parse dataset rule json: {}", rule, e);
    }
    return datasetRule;
}

/**
 * Reads the numeric rule argument from a JSON node.
 *
 * @param argsNode the {@code args} node, or {@code null} when the field is absent
 * @param rule     the raw rule JSON, used only for log output
 * @return the parsed value, or {@code 0.0} when the node is absent or not a number
 */
private static double readRuleArgs(JsonNode argsNode, String rule) {
    if (argsNode != null && argsNode.isNumber()) {
        // Accept {"args": 0.5} as well as {"args": "0.5"}.
        return argsNode.doubleValue();
    }
    if (argsNode != null && argsNode.isTextual()) {
        try {
            return Double.parseDouble(argsNode.textValue());
        } catch (NumberFormatException e) {
            log.error("dataset rule args is not a number: {}", rule, e);
            return 0.0;
        }
    }
    log.warn("dataset rule has no usable args value: {}", rule);
    return 0.0;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
View file @
7f447ff0
...
...
@@ -64,7 +64,7 @@ public class DatasetCleanVo {
/**
* 清洗状态。
*/
@ApiModelProperty
(
value
=
"清洗状态"
)
@ApiModelProperty
(
value
=
"清洗状态
;0:未清洗;1:已清洗;2:暂停清洗
"
)
private
Integer
cleanStatus
;
/**
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetDataFilterVo.java
View file @
7f447ff0
...
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
private
Long
cleanId
;
/**
*
过滤内容
。
*
清洗前数据
。
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
private
String
content
;
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
* 创建时间。
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment