lmp / lmp_server / Commits

Commit ccb4b969 authored Apr 10, 2024 by pengxin

Update status (更新状态)

parent 092b5f16
Showing 2 changed files with 46 additions and 25 deletions

  DatasetCleanServiceImpl.java (...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java)  +43 -22
  DataCleanerUtil.java (...main/java/com/yice/webadmin/app/util/DataCleanerUtil.java)          +3 -3
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java @ ccb4b969
@@ -110,23 +110,54 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
             datasetCleanConfigMapper.insert(datasetCleanConfig);
         }
-        DatasetVersion datasetVersion = new DatasetVersion();
-        datasetVersion.setVersionId(datasetClean.getDatasetId());
-        datasetVersion.setCleanStatus(DatasetConstant.CLEAN_PROGRESS);
-        datasetVersionService.updateById(datasetVersion);
+        updateVersionStatus(datasetClean.getDatasetId(), DatasetConstant.CLEAN_PROGRESS);
         doDatasetCleanHandler(datasetClean.getDatasetId(), datasetClean.getCleanId());
         return datasetClean;
     }

+    /**
+     * Clean 100 samples
+     * @param datasetId the version the dataset corresponds to
+     */
+    @Async("taskExecutor")
+    public void doDatasetCleanHandler(Long datasetId, Long cleanId) {
+        MyPageParam param = new MyPageParam();
+        param.setPageNum(DatasetConstant.PAGE_NUM);
+        param.setPageSize(DatasetConstant.MAX_PAGE_SIZE);
+        List<DatasetData> dataList = datasetDataService.list(datasetId, param);
+        if (CollUtil.isNotEmpty(dataList)) {
+            Future<Void> future = executeCleanTaskAsync(dataList, cleanId, datasetId);
+            futures.put(cleanId, future);
+        }
+    }
+
+    /**
+     * Thread method
+     * @param dataList list of dataset records
+     * @param cleanId clean task id
+     * @param datasetId clean set id
+     * @return the async task result
+     */
+    public Future<Void> executeCleanTaskAsync(List<DatasetData> dataList, Long cleanId, Long datasetId) {
+        asyncDealWithDatasetSaveBatch(dataList, cleanId);
+        dealWithTaskHandler(datasetId, cleanId);
+        updateCleanStatus(cleanId, DatasetConstant.CLEAN_FINISHED);
+        updateVersionStatus(datasetId, DatasetConstant.CLEAN_FINISHED);
+        return new AsyncResult<>(null);
+    }
+
+    /**
+     * Update version status
+     * @param versionId dataset id
+     * @param cleanStatus clean status
+     */
+    private void updateVersionStatus(Long versionId, Integer cleanStatus) {
+        DatasetVersion filter = new DatasetVersion();
+        filter.setVersionId(versionId);
+        filter.setCleanStatus(cleanStatus);
+        this.datasetVersionService.updateById(filter);
+    }

     /**
      * Update clean status
      * @param cleanId clean task id
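The hunk above hinges on two things: Spring runs the @Async("taskExecutor") method on a dedicated thread pool, and the resulting Future is kept in a futures map so a later pause request can cancel it. Below is a minimal, self-contained sketch of that pattern; apart from @Async("taskExecutor") and AsyncResult, every name (CleanWorker, CleanCoordinator, runClean, start, pause) is illustrative and not taken from the project.

// In a real project these would be two separate public @Service classes.
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
import org.springframework.stereotype.Service;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;

@Service
class CleanWorker {

    /**
     * Runs on the "taskExecutor" thread pool; Spring replaces the AsyncResult
     * with a Future that the caller can poll or cancel.
     */
    @Async("taskExecutor")
    public Future<Void> runClean(Long cleanId) {
        // ... long-running cleaning work for cleanId ...
        return new AsyncResult<>(null);
    }
}

@Service
class CleanCoordinator {

    /** cleanId -> running task, so a later pause request can find and cancel it. */
    private final Map<Long, Future<Void>> futures = new ConcurrentHashMap<>();

    private final CleanWorker worker;

    CleanCoordinator(CleanWorker worker) {
        this.worker = worker;
    }

    public void start(Long cleanId) {
        // Calling through another bean goes via the Spring proxy, so it really runs async.
        futures.put(cleanId, worker.runClean(cleanId));
    }

    public void pause(Long cleanId) {
        Future<Void> future = futures.get(cleanId);
        if (future != null && !future.isDone()) {
            future.cancel(true); // delivers an interrupt to the worker thread
        }
    }
}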
@@ -139,21 +170,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         this.updateById(filter);
     }

-    /**
-     * Clean 100 samples
-     * @param datasetId the version the dataset corresponds to
-     */
-    private void doDatasetCleanHandler(Long datasetId, Long cleanId) {
-        MyPageParam param = new MyPageParam();
-        param.setPageNum(DatasetConstant.PAGE_NUM);
-        param.setPageSize(DatasetConstant.MAX_PAGE_SIZE);
-        List<DatasetData> dataList = datasetDataService.list(datasetId, param);
-        if (CollUtil.isNotEmpty(dataList)) {
-            Future<Void> future = executeCleanTaskAsync(dataList, cleanId, datasetId);
-            futures.put(cleanId, future);
-        }
-    }

     /**
      * Asynchronously process only the first 100 records to clean
      * @param dataList list of records to clean
@@ -180,7 +196,11 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         if (future != null && !future.isDone()) {
             future.cancel(true);
         }
-        updateCleanStatus(cleanId, DatasetConstant.PAUSE_FINISHED);
+        DatasetClean filter = this.datasetCleanMapper.selectById(cleanId);
+        if (null != filter) {
+            updateCleanStatus(cleanId, DatasetConstant.PAUSE_FINISHED);
+            updateVersionStatus(filter.getDatasetId(), DatasetConstant.PAUSE_FINISHED);
+        }
     }

     /**
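Note that future.cancel(true) in the pause path only delivers an interrupt to the worker thread; the task actually stops early only if its loop cooperates. A hedged fragment showing one way such a loop could honour cancellation (the method name runCleanLoop is illustrative, not the project's):

    // Illustrative fragment, not the project's code: check the interrupt flag between records.
    @Async("taskExecutor")
    public Future<Void> runCleanLoop(List<DatasetData> dataList, Long cleanId) {
        for (DatasetData row : dataList) {
            if (Thread.currentThread().isInterrupted()) {
                // pause cancelled this task; stop without finishing the batch
                return new AsyncResult<>(null);
            }
            // ... clean one record and persist it ...
        }
        return new AsyncResult<>(null);
    }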
@@ -192,8 +212,9 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
         DatasetClean clean = this.datasetCleanMapper.selectById(cleanId);
+        if (null != clean) {
+            doDatasetCleanHandler(clean.getDatasetId(), cleanId);
+            updateCleanStatus(cleanId, DatasetConstant.CLEAN_PROGRESS);
+            updateVersionStatus(clean.getDatasetId(), DatasetConstant.CLEAN_PROGRESS);
+        }
-        updateCleanStatus(cleanId, DatasetConstant.CLEAN_PROGRESS);
     }

     /**
application-webadmin/src/main/java/com/yice/webadmin/app/util/DataCleanerUtil.java @ ccb4b969
@@ -143,7 +143,7 @@ public class DataCleanerUtil {
         Matcher matcher = pattern.matcher(data);
         while (matcher.find()) {
             // If a key character appears, replace it directly with a blank string
-            matcher.appendReplacement(result, "***");
+            matcher.appendReplacement(result, "");
         }
         matcher.appendTail(result);
     } else {
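The appendReplacement/appendTail pair above rebuilds the string piece by piece: each appendReplacement copies the text up to the current match plus the replacement, and appendTail copies whatever follows the last match. A small standalone sketch of that mechanism (the pattern and the input string are made-up examples, not taken from DataCleanerUtil):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class AppendReplacementDemo {
    public static void main(String[] args) {
        Pattern pattern = Pattern.compile("[#$%^&()]");   // assumed "key character" class
        String data = "price: $12 (approx) #tag";
        StringBuffer result = new StringBuffer();

        Matcher matcher = pattern.matcher(data);
        while (matcher.find()) {
            // "" drops the matched character entirely; "***" would mask it instead
            matcher.appendReplacement(result, "");
        }
        matcher.appendTail(result);

        System.out.println(result);   // price: 12 approx tag
    }
}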
@@ -236,7 +236,7 @@ public class DataCleanerUtil {
         StringBuffer result = new StringBuffer();
         double specialCharacterRatio = calculateSpecialCharacterRatio(data);
         if (specialCharacterRatio <= radio) {
-            result.append(data.replaceAll("[#$^&()]", ""));
+            result.append(data.replaceAll("[#$*^&()]", ""));
         } else {
             result.append(data);
         }
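Inside a character class the * is a literal, so widening the class to [#$*^&()] makes replaceAll strip asterisks as well; the other characters keep their literal meaning. A quick check with a made-up input:

public class CharClassDemo {
    public static void main(String[] args) {
        String before = "a*b$c(d)e";                              // made-up sample input
        System.out.println(before.replaceAll("[#$^&()]", ""));    // a*bcde  (old class keeps '*')
        System.out.println(before.replaceAll("[#$*^&()]", ""));   // abcde   (new class strips it)
    }
}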
@@ -253,7 +253,7 @@ public class DataCleanerUtil {
         int specialCharactersCount = 0;
         for (Term term : termList) {
-            if (term.word.matches(".*[#$%^&()].*")) {
+            if (term.word.matches(".*[#$%*^&()].*")) {
                 specialCharactersCount++;
             }
         }
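The same widened class feeds the ratio check in the previous hunk: the counting loop shown here tallies how many segmented terms contain a special character, and calculateSpecialCharacterRatio (not fully shown in this diff) turns that count into a share compared against the threshold. A hedged sketch of that idea over plain whitespace tokens, since the project's segmenter and its Term type are not visible here:

import java.util.Arrays;
import java.util.List;

public class SpecialCharRatioDemo {

    // Assumed re-implementation for illustration: share of tokens containing a special character.
    static double specialCharacterRatio(String data) {
        List<String> tokens = Arrays.asList(data.split("\\s+"));
        if (tokens.isEmpty()) {
            return 0.0;
        }
        long specialCharactersCount = tokens.stream()
                .filter(word -> word.matches(".*[#$%*^&()].*"))
                .count();
        return (double) specialCharactersCount / tokens.size();
    }

    public static void main(String[] args) {
        String data = "clean text with one #tag and a price $9";
        System.out.println(specialCharacterRatio(data));   // 2 of 9 tokens -> ~0.22
    }
}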