Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
lmp_server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lmp
lmp_server
Commits
7f447ff0
Commit
7f447ff0
authored
Apr 08, 2024
by
pengxin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
数据集清洗新增开关模块。
parent
cc809ff6
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
523 additions
and
85 deletions
+523
-85
pom.xml
application-webadmin/pom.xml
+5
-0
DatasetCleanConstant.java
.../com/yice/webadmin/app/constant/DatasetCleanConstant.java
+25
-0
DatasetConstant.java
.../java/com/yice/webadmin/app/constant/DatasetConstant.java
+30
-0
DatasetVersionController.java
...ice/webadmin/app/controller/DatasetVersionController.java
+1
-1
DatasetDataFilter.java
...in/java/com/yice/webadmin/app/data/DatasetDataFilter.java
+8
-2
DatasetRule.java
...src/main/java/com/yice/webadmin/app/data/DatasetRule.java
+30
-0
DatasetCleanDto.java
.../main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
+1
-1
DatasetDataFilterDto.java
.../java/com/yice/webadmin/app/dto/DatasetDataFilterDto.java
+9
-3
DatasetCleanConfigServiceImpl.java
...admin/app/service/impl/DatasetCleanConfigServiceImpl.java
+2
-1
DatasetCleanServiceImpl.java
...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java
+108
-42
DatasetDataServiceImpl.java
...ice/webadmin/app/service/impl/DatasetDataServiceImpl.java
+1
-0
DataCleanerUtil.java
...main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
+271
-31
JsonNameExtractor.java
...in/java/com/yice/webadmin/app/util/JsonNameExtractor.java
+22
-0
DatasetCleanVo.java
...rc/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
+1
-1
DatasetDataFilterVo.java
...in/java/com/yice/webadmin/app/vo/DatasetDataFilterVo.java
+9
-3
No files found.
application-webadmin/pom.xml
View file @
7f447ff0
...
...
@@ -70,6 +70,11 @@
<artifactId>
opencc4j
</artifactId>
<version>
1.6.2
</version>
</dependency>
<dependency>
<groupId>
com.hankcs
</groupId>
<artifactId>
hanlp
</artifactId>
<version>
portable-1.8.2
</version>
</dependency>
</dependencies>
<build>
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetCleanConstant.java
View file @
7f447ff0
...
...
@@ -50,4 +50,29 @@ public class DatasetCleanConstant {
*/
public
static
final
String
REPLACE_IDENTIFIER
=
"replace_identifier"
;
/**
* 检查文档的词数目
*/
public
static
final
String
FILTER_CHECK_NUMBER_WORDS
=
"filter_check_number_words"
;
/**
* 检查文档的字重复率
*/
public
static
final
String
FILTER_CHECK_WORD_REPETITION_REMOVAL
=
"filter_check_word_repetition_removal"
;
/**
* 检查文档的词重复率
*/
public
static
final
String
FILTER_CHECK_CHARACTER_REPETITION_REMOVAL
=
"filter_check_character_repetition_removal"
;
/**
* 检查文档的特殊字符率
*/
public
static
final
String
FILTER_CHECK_SPECIAL_CHARACTERS
=
"filter_check_special_characters"
;
/**
* 检查文档的色情暴力词率
*/
public
static
final
String
FILTER_CHECK_FLAGGED_WORDS
=
"filter_check_flagged_words"
;
}
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetConstant.java
View file @
7f447ff0
...
...
@@ -7,6 +7,11 @@ public class DatasetConstant {
*/
public
static
final
Integer
STATUS_UNPUBLISHED
=
0
;
/**
* 已导入
*/
public
static
final
Integer
INPUT_STATUS
=
1
;
/**
* 已发布状态
*/
...
...
@@ -62,6 +67,21 @@ public class DatasetConstant {
*/
public
static
final
String
OUTPUT
=
"output"
;
/**
* args参数值
*/
public
static
final
String
ARGS
=
"args"
;
/**
* args参数值
*/
public
static
final
String
NAME
=
"name"
;
/**
* 关状态
*/
public
static
final
String
CLOSED
=
"0"
;
/**
* data数据
*/
...
...
@@ -82,6 +102,16 @@ public class DatasetConstant {
*/
public
static
final
Integer
CLEAN_FINISHED
=
1
;
/**
* 暂停清洗
*/
public
static
final
Integer
PAUSE_FINISHED
=
2
;
/**
* 空白字符
*/
public
static
final
String
EMPTY_STR
=
""
;
/**
* 文本数据清洗
*/
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/controller/DatasetVersionController.java
View file @
7f447ff0
...
...
@@ -362,7 +362,7 @@ public class DatasetVersionController {
//再存储数据集配置文件
datasetVersionService
.
saveDatasetInfo
(
versionName
);
datasetVersion
.
setFileUrl
(
fullName
);
datasetVersion
.
setInputStatus
(
1
);
datasetVersion
.
setInputStatus
(
DatasetConstant
.
INPUT_STATUS
);
datasetVersion
.
setDataVolume
(
Long
.
valueOf
(
JSON
.
parseArray
(
new
String
(
importFile
.
getBytes
(),
StandardCharsets
.
UTF_8
)).
size
()));
this
.
datasetVersionService
.
updateById
(
datasetVersion
);
return
ResponseResult
.
success
();
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataFilter.java
View file @
7f447ff0
...
...
@@ -27,11 +27,17 @@ public class DatasetDataFilter {
@Field
(
"clean_id"
)
private
Long
cleanId
;
/**
* 清洗前数据
*/
@Field
(
"clean_before_data"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据
*/
@Field
(
"c
ontent
"
)
private
String
c
ontent
;
@Field
(
"c
lean_after_data
"
)
private
String
c
leanAfterData
;
/**
* 创建时间
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetRule.java
0 → 100644
View file @
7f447ff0
package com.yice.webadmin.app.data;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * A single dataset-cleaning rule parsed from a rule-config JSON object
 * (produced by JsonNameExtractor.buildRuleData and consumed by
 * DataCleanerUtil.buildCleanAfterData).
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DatasetRule {
    /**
     * Rule name; matched against the DatasetCleanConstant identifiers
     * in DataCleanerUtil's rule switch.
     */
    private String name;
    /**
     * Rule argument passed to the cleaner as a threshold (ratio or count).
     * NOTE(review): the original comment said "版本标识" (version identifier),
     * but callers pass this value as the cleaning threshold — confirm intent.
     */
    private double args;

    @Override
    public String toString() {
        return "DatasetRule{" + "args=" + args + ", name='" + name + '\'' + '}';
    }
}
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
View file @
7f447ff0
...
...
@@ -64,7 +64,7 @@ public class DatasetCleanDto {
/**
* 清洗状态。
*/
@ApiModelProperty
(
value
=
"清洗状态
:0:进行中;1:已完成
"
)
@ApiModelProperty
(
value
=
"清洗状态
;0:未清洗;1:已清洗;2:暂停清洗
"
)
private
Integer
cleanStatus
;
/**
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetDataFilterDto.java
View file @
7f447ff0
...
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterDto {
private
Long
cleanId
;
/**
*
过滤内容
。
*
清洗前数据
。
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
private
String
content
;
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
* 创建时间。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanConfigServiceImpl.java
View file @
7f447ff0
...
...
@@ -244,7 +244,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List
<
Document
>
documents
=
new
ArrayList
<>();
if
(
CollUtil
.
isNotEmpty
(
filters
))
{
for
(
DatasetDataFilter
filter
:
filters
)
{
Document
document
=
new
Document
(
MongoConstant
.
CONTENT
,
filter
.
getContent
())
Document
document
=
new
Document
(
MongoConstant
.
CLEAN_BEFORE_DATA
,
filter
.
getCleanBeforeData
())
.
append
(
MongoConstant
.
CLEAN_AFTER_DATA
,
filter
.
getCleanAfterData
())
.
append
(
MongoConstant
.
CLEAN_ID
,
filter
.
getCleanId
())
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
documents
.
add
(
document
);
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java
View file @
7f447ff0
...
...
@@ -27,6 +27,7 @@ import com.yice.webadmin.app.service.DatasetVersionService;
import
com.yice.webadmin.app.util.DataCleanerUtil
;
import
com.yice.webadmin.app.util.JsonNameExtractor
;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.lang3.StringUtils
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.scheduling.annotation.Async
;
import
org.springframework.scheduling.annotation.AsyncResult
;
...
...
@@ -37,6 +38,7 @@ import java.io.File;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.ConcurrentHashMap
;
...
...
@@ -176,6 +178,13 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
Future
<?>
future
=
futures
.
remove
(
cleanId
);
if
(
future
!=
null
&&
!
future
.
isDone
())
{
future
.
cancel
(
true
);
//暂停清洗
DatasetClean
filter
=
new
DatasetClean
();
filter
.
setCleanStatus
(
DatasetConstant
.
PAUSE_FINISHED
);
filter
.
setFinishTime
(
null
);
filter
.
setCleanId
(
cleanId
);
this
.
updateById
(
filter
);
}
}
...
...
@@ -204,42 +213,63 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
*/
private
void
dealWithTaskHandler
(
Long
datasetId
,
Long
cleanId
)
{
try
{
List
<
String
>
rules
=
new
ArrayList
<>();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
config
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
config
)
{
rules
.
add
(
config
.
getFilterConfig
());
rules
.
add
(
config
.
getDesensitiveConfig
());
rules
.
add
(
config
.
getDeduplicateConfig
());
rules
.
add
(
config
.
getCleanConfig
());
rules
=
rules
.
stream
()
.
filter
(
rule
->
rule
!=
null
&&
!
rule
.
isEmpty
())
.
collect
(
Collectors
.
toList
());
rules
=
JsonNameExtractor
.
extractNames
(
rules
);
}
DatasetVersion
datasetVersion
=
this
.
datasetVersionService
.
getById
(
datasetId
);
datasetVersionService
.
saveDatasetInfo
(
datasetVersion
.
getVersionName
());
clearFileDatasetData
(
datasetVersion
.
getFileUrl
());
Long
count
=
datasetDataService
.
count
(
datasetId
);
int
pageSize
=
DatasetConstant
.
MAX_SIZE
;
int
totalPages
=
(
int
)
Math
.
ceil
((
double
)
count
/
pageSize
);
MyPageParam
param
;
for
(
int
i
=
1
;
i
<=
totalPages
;
i
++)
{
param
=
new
MyPageParam
();
param
.
setPageNum
(
i
);
param
.
setPageSize
(
pageSize
);
List
<
DatasetData
>
dataList
=
datasetDataService
.
list
(
datasetId
,
param
);
dealWithDatasetNodeData
(
dataList
,
datasetId
,
rules
);
appendDataListToFile
(
datasetVersion
.
getFileUrl
()
,
dataList
);
if
(
count
>
0
)
{
List
<
DatasetRule
>
rules
=
buildRulesList
(
cleanId
);
int
pageSize
=
DatasetConstant
.
MAX_SIZE
;
int
totalPages
=
(
int
)
Math
.
ceil
((
double
)
count
/
pageSize
);
MyPageParam
param
;
for
(
int
i
=
1
;
i
<=
totalPages
;
i
++)
{
param
=
new
MyPageParam
();
param
.
setPageNum
(
i
);
param
.
setPageSize
(
pageSize
);
List
<
DatasetData
>
dataList
=
datasetDataService
.
list
(
datasetId
,
param
);
List
<
DatasetData
>
newDataList
=
dealWithDatasetNodeData
(
dataList
,
datasetId
,
rules
);
if
(
CollUtil
.
isNotEmpty
(
newDataList
))
{
appendDataListToFile
(
datasetVersion
.
getFileUrl
()
,
newDataList
);
}
}
}
}
catch
(
Exception
ex
)
{
log
.
error
(
"deal with task handler is error:"
,
ex
);
}
}
/**
* 构建规则列表
* @param cleanId 清洗标识
* @return 规则列表
*/
private
List
<
DatasetRule
>
buildRulesList
(
Long
cleanId
)
{
DatasetCleanConfig
cleanConfig
=
new
DatasetCleanConfig
();
cleanConfig
.
setCleanId
(
cleanId
);
DatasetCleanConfig
datasetCleanConfig
=
datasetCleanConfigService
.
getOne
(
cleanConfig
);
List
<
DatasetRule
>
rules
=
new
ArrayList
<>();
if
(
null
!=
datasetCleanConfig
)
{
String
[]
jsonStrings
=
{
datasetCleanConfig
.
getFilterConfig
(),
datasetCleanConfig
.
getDesensitiveConfig
(),
datasetCleanConfig
.
getDesensitiveConfig
(),
datasetCleanConfig
.
getDeduplicateConfig
(),
datasetCleanConfig
.
getCleanConfig
()};
ObjectMapper
objectMapper
=
new
ObjectMapper
();
rules
=
Arrays
.
stream
(
jsonStrings
)
.
map
(
jsonString
->
{
try
{
return
objectMapper
.
readValue
(
jsonString
,
DatasetRule
[].
class
);
}
catch
(
JsonProcessingException
e
)
{
log
.
error
(
"json processing exception is error:"
,
e
);
return
null
;
}
})
.
flatMap
(
Arrays:
:
stream
)
.
collect
(
Collectors
.
toList
());
}
return
rules
;
}
/**
* 第一个方法:清空文件
* @param filePath 文件地址
...
...
@@ -299,7 +329,9 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
* @param datasetId 数据集标识
* @param rules 规则列表
*/
private
void
dealWithDatasetNodeData
(
List
<
DatasetData
>
dataList
,
Long
datasetId
,
List
<
String
>
rules
)
{
private
List
<
DatasetData
>
dealWithDatasetNodeData
(
List
<
DatasetData
>
dataList
,
Long
datasetId
,
List
<
DatasetRule
>
rules
)
{
List
<
DatasetData
>
newDataList
=
new
ArrayList
<>();
try
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
...
...
@@ -307,13 +339,19 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
JsonNode
rootNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
rootNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
output
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
datasetData
.
setData
(
createNewDataNode
(
datasetData
.
getData
(),
output
));
//校验清洗后的数据是否满足条件,如果满足条件,则进行添加,否则直接移除。
if
(
StringUtils
.
isNotBlank
(
output
))
{
datasetData
.
setData
(
createNewDataNode
(
datasetData
.
getData
(),
output
));
newDataList
.
add
(
datasetData
);
}
}
this
.
datasetDataService
.
updateBatch
(
d
ataList
,
datasetId
);
this
.
datasetDataService
.
updateBatch
(
newD
ataList
,
datasetId
);
}
}
catch
(
JsonProcessingException
ex
){
log
.
error
(
"deal with dataset node data:"
,
ex
);
}
return
newDataList
;
}
/**
...
...
@@ -359,15 +397,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
cleanConfig
&&
null
==
cleanConfig
.
getCleanConfig
())
return
cleans
;
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getCleanConfig
());
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
DatasetRule
[]
datasetRules
=
objectMapper
.
readValue
(
cleanConfig
.
getCleanConfig
(),
DatasetRule
[].
class
);
List
<
DatasetRule
>
rules
=
Arrays
.
asList
(
datasetRules
);
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataClean
dataClean
=
new
DatasetDataClean
();
JsonNode
rootNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
rootNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
dataClean
.
setCleanBeforeData
(
data
);
dataClean
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
));
String
cleanAfterData
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
if
(
StringUtils
.
isEmpty
(
cleanAfterData
))
break
;
dataClean
.
setCleanAfterData
(
cleanAfterData
);
dataClean
.
setCleanId
(
cleanId
);
dataClean
.
setCreateTime
(
new
Date
());
cleans
.
add
(
dataClean
);
...
...
@@ -391,17 +436,24 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
try
{
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
cleanConfig
&&
null
==
cleanConfig
.
getDesensitiveConfig
())
return
desensitives
;
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getDesensitiveConfig
());
ObjectMapper
objectMapper
=
new
ObjectMapper
();
DatasetCleanConfig
desenstiveCfg
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
desenstiveCfg
&&
null
==
desenstiveCfg
.
getDesensitiveConfig
())
return
desensitives
;
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
DatasetRule
[]
datasets
=
objectMapper
.
readValue
(
desenstiveCfg
.
getDesensitiveConfig
(),
DatasetRule
[].
class
);
List
<
DatasetRule
>
rules
=
Arrays
.
asList
(
datasets
);
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataDesensitive
desensitive
=
new
DatasetDataDesensitive
();
JsonNode
node
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
node
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
cleanAfterData
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
if
(
StringUtils
.
isEmpty
(
cleanAfterData
))
break
;
desensitive
.
setCleanBeforeData
(
data
);
desensitive
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
)
);
desensitive
.
setCleanAfterData
(
cleanAfterData
);
desensitive
.
setCleanId
(
cleanId
);
desensitive
.
setCreateTime
(
new
Date
());
desensitives
.
add
(
desensitive
);
...
...
@@ -427,15 +479,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
deduplicateConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
deduplicateConfig
&&
null
==
deduplicateConfig
.
getDeduplicateConfig
())
return
deduplicates
;
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
deduplicateConfig
.
getDeduplicateConfig
());
ObjectMapper
objectMapper
=
new
ObjectMapper
();
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
DatasetRule
[]
datasetRules
=
objectMapper
.
readValue
(
deduplicateConfig
.
getDeduplicateConfig
(),
DatasetRule
[].
class
);
List
<
DatasetRule
>
rules
=
Arrays
.
asList
(
datasetRules
);
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataDeduplicate
deduplicate
=
new
DatasetDataDeduplicate
();
JsonNode
jsonNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
jsonNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
cleanAfterData
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
if
(
StringUtils
.
isEmpty
(
cleanAfterData
))
break
;
deduplicate
.
setCleanBeforeData
(
data
);
deduplicate
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
)
);
deduplicate
.
setCleanAfterData
(
cleanAfterData
);
deduplicate
.
setCleanId
(
cleanId
);
deduplicate
.
setCreateTime
(
new
Date
());
deduplicates
.
add
(
deduplicate
);
...
...
@@ -461,17 +520,24 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
filterConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
filterConfig
&&
null
==
filterConfig
.
getFilterConfig
())
return
filters
;
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
filterConfig
.
getFilterConfig
());
ObjectMapper
objectMapper
=
new
ObjectMapper
();
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
DatasetRule
[]
datasetRules
=
objectMapper
.
readValue
(
filterConfig
.
getFilterConfig
(),
DatasetRule
[].
class
);
List
<
DatasetRule
>
rules
=
Arrays
.
asList
(
datasetRules
);
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataFilter
dataFilter
=
new
DatasetDataFilter
();
dataFilter
.
setCleanId
(
cleanId
);
dataFilter
.
setCreateTime
(
new
Date
());
//TODO
JsonNode
rootNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
rootNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
dataFilter
.
setContent
(
data
);
String
cleanAfterData
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
if
(
StringUtils
.
isEmpty
(
cleanAfterData
))
break
;
dataFilter
.
setCleanBeforeData
(
data
);
dataFilter
.
setCleanAfterData
(
cleanAfterData
);
filters
.
add
(
dataFilter
);
}
}
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetDataServiceImpl.java
View file @
7f447ff0
...
...
@@ -161,6 +161,7 @@ public class DatasetDataServiceImpl implements DatasetDataService {
public
void
updateBatch
(
List
<
DatasetData
>
dataList
,
Long
versionId
)
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData
:
dataList
)
{
// 解析data字段的字符串为Document或Bson
Document
dataDocument
=
Document
.
parse
(
datasetData
.
getData
());
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
View file @
7f447ff0
package
com
.
yice
.
webadmin
.
app
.
util
;
import
com.github.houbb.opencc4j.util.ZhConverterUtil
;
import
com.hankcs.hanlp.HanLP
;
import
com.hankcs.hanlp.seg.common.Term
;
import
com.yice.webadmin.app.constant.DatasetCleanConstant
;
import
com.yice.webadmin.app.constant.DatasetConstant
;
import
com.yice.webadmin.app.data.DatasetRule
;
import
lombok.extern.slf4j.Slf4j
;
import
java.util.List
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
* 数据清洗工具类
*/
@Slf4j
public
class
DataCleanerUtil
{
private
static
final
Set
<
String
>
badWords
=
new
HashSet
<>();
static
{
badWords
.
add
(
"色情"
);
badWords
.
add
(
"淫秽"
);
badWords
.
add
(
"迷信"
);
badWords
.
add
(
"黄色"
);
badWords
.
add
(
"性行为"
);
badWords
.
add
(
"暴力"
);
}
/**
* 定义清洗后的数据
*
...
...
@@ -16,38 +36,258 @@ public class DataCleanerUtil {
* @param rules 清洗规则
* @return 返回清洗后的数据
*/
public
static
String
buildCleanAfterData
(
String
data
,
List
<
String
>
rules
)
{
for
(
String
rule
:
rules
)
{
switch
(
rule
)
{
case
DatasetCleanConstant
.
REMOVE_INVISIBLE_CHARACTER
:
data
=
data
.
replaceAll
(
"[\\p{C}]"
,
" "
);
break
;
case
DatasetCleanConstant
.
REPLACE_UNIFORM_WHITESPACE
:
data
=
data
.
replaceAll
(
"[\\p{Cs}\\p{Co}\\p{Cn}]"
,
""
);
break
;
case
DatasetCleanConstant
.
REMOVE_NON_MEANING_CHARACTERS
:
data
=
data
.
replaceAll
(
"[\\p{Z}\\u2000-\\u200A\\u2028\\u2029\\u3000]"
,
""
);
break
;
case
DatasetCleanConstant
.
REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED
:
data
=
ZhConverterUtil
.
toSimple
(
data
);
break
;
case
DatasetCleanConstant
.
REMOVE_WEB_IDENTIFIERS
:
data
=
data
.
replaceAll
(
"<[^>]*>"
,
""
);
break
;
case
DatasetCleanConstant
.
REMOVE_EMOJI
:
data
=
data
.
replaceAll
(
"[\\ud83c[\\udffb-\\udfff]|\\ud83d[\\udc00-\\ude4f]|\\ud83d[\\ude80-\\udeff]|\\ud83e[\\udd10-\\uddff]]"
,
""
);
break
;
case
DatasetCleanConstant
.
REPLACE_EMAILS
:
data
=
data
.
replaceAll
(
"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"
,
"EMAIL"
);
break
;
case
DatasetCleanConstant
.
REPLACE_IP
:
data
=
data
.
replaceAll
(
"\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b"
,
"IP_ADDRESS"
);
break
;
case
DatasetCleanConstant
.
REPLACE_IDENTIFIER
:
data
=
data
.
replaceAll
(
"\\d+"
,
"PI:KEY"
);
break
;
public
static
String
buildCleanAfterData
(
String
data
,
List
<
DatasetRule
>
rules
)
{
StringBuilder
sb
=
new
StringBuilder
();
for
(
DatasetRule
rule
:
rules
)
{
if
(
rule
.
getArgs
()
>
0
)
{
data
=
buildJsonData
(
rule
.
getName
(),
data
,
rule
.
getArgs
());
sb
.
append
(
data
);
}
}
return
sb
.
toString
();
}
    /**
     * Applies one named cleaning rule to {@code data}.
     *
     * @param rule  rule name (a DatasetCleanConstant identifier)
     * @param data  text to clean
     * @param radio rule threshold (ratio or count, meaning depends on the rule)
     * @return the cleaned text; returned unchanged when the rule name is unknown
     */
    private static String buildJsonData(String rule, String data, double radio) {
        switch (rule) {
            case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
                // Strip ASCII C0 controls, DEL and C1 controls.
                data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", "");
                break;
            case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
                // Remove runs of space / NBSP / Unicode space separators.
                data = data.replaceAll("[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+", "");
                break;
            case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
                // Remove control chars and unassigned code points.
                data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", "");
                break;
            case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
                // opencc4j traditional -> simplified conversion.
                data = ZhConverterUtil.toSimple(data);
                break;
            case DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS:
                // Strip HTML/XML-style tags.
                data = data.replaceAll("<[^>]*>", "");
                break;
            case DatasetCleanConstant.REMOVE_EMOJI:
                // Private-use area plus emoji surrogate ranges.
                data = data.replaceAll("[\\uE000-\\uF8FF]|\ud83c[\\ud000-\\udfff]|\ud83d[\\ud000-\\udfff]|\ud83e[\\ud000-\\udfff]", "");
                break;
            case DatasetCleanConstant.REPLACE_EMAILS:
                // Replace email addresses with the literal "EMAIL".
                data = data.replaceAll("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", "EMAIL");
                break;
            case DatasetCleanConstant.REPLACE_IP:
                // Replace dotted-quad IPv4 addresses with "IP_ADDRESS".
                data = data.replaceAll("\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.)" + "{3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b".substring(0), "IP_ADDRESS");
                break;
            case DatasetCleanConstant.REPLACE_IDENTIFIER:
                // Remove every digit run.
                data = data.replaceAll("\\d+", "");
                break;
            case DatasetCleanConstant.FILTER_CHECK_NUMBER_WORDS:
                // NOTE(review): minimum word count hard-coded to 0, radio used
                // as the maximum — confirm intended range semantics.
                data = filterNumberWords(data, 0, (int) radio);
                break;
            case DatasetCleanConstant.FILTER_CHECK_WORD_REPETITION_REMOVAL:
                data = filterWordRepetition(data, radio);
                break;
            case DatasetCleanConstant.FILTER_CHECK_CHARACTER_REPETITION_REMOVAL:
                data = filterCharacterRepetition(data, radio);
                break;
            case DatasetCleanConstant.FILTER_CHECK_SPECIAL_CHARACTERS:
                data = filterSpecialCharacters(data, radio);
                break;
            case DatasetCleanConstant.FILTER_CHECK_FLAGGED_WORDS:
                data = filterCheckFlaggedWords(data, radio);
                break;
        }
        return data;
    }
public
static
void
main
(
String
[]
args
)
{
String
data
=
"我们是中国人,我们是地球人,我们要团结起来。团结就是力量,力量就是一切。"
;
data
=
filterWordRepetition
(
data
,
0.3
);
System
.
out
.
println
(
data
);
}
/**
* 计算阀值
* @param data 计算坏的数据
* @return 返回阀值
*/
private
static
double
calculateBadWordRatio
(
String
data
)
{
// 标准分词
List
<
Term
>
termList
=
HanLP
.
segment
(
data
);
int
badWordsCount
=
0
;
for
(
Term
term
:
termList
)
{
if
(
badWords
.
contains
(
term
.
word
.
toLowerCase
()))
{
badWordsCount
++;
}
}
return
badWordsCount
/
(
double
)
termList
.
size
();
}
    /**
     * Flagged-word (porn/violence) check: when the flagged-word ratio of the
     * text exceeds {@code radio}, every flagged word is removed; otherwise the
     * text is returned unchanged.
     *
     * @param data  text to filter
     * @param radio tolerated flagged-word ratio
     * @return filtered or original text
     */
    private static String filterCheckFlaggedWords(String data, double radio) {
        StringBuffer result = new StringBuffer();
        double badWordRatio = calculateBadWordRatio(data);
        if (badWordRatio > radio) {
            // Alternation over all flagged words; (?![\w]) keeps matches that
            // are not followed by a word character.
            Pattern pattern = Pattern.compile("(" + String.join("|", badWords) + ")(?![\\w])");
            Matcher matcher = pattern.matcher(data);
            while (matcher.find()) {
                // Replace each flagged word with the empty string.
                matcher.appendReplacement(result, "");
            }
            matcher.appendTail(result);
        } else {
            result.append(data);
        }
        return result.toString();
    }
/**
* 检查文档的词重复率
* 计算词重复率并替换重复内容
* @param document 文档内容字符串
* @param threshold 阈值比率
* @return 处理后的文档内容,如果词重复率超过阈值,则重复内容被替换为空字符
*/
public
static
String
filterCharacterRepetition
(
String
document
,
double
threshold
)
{
// 使用HanLP进行分词
List
<
Term
>
termList
=
HanLP
.
segment
(
document
);
// 统计每个词的出现次数
Map
<
String
,
Integer
>
wordCountMap
=
new
HashMap
<>();
for
(
Term
term
:
termList
)
{
String
word
=
term
.
word
;
wordCountMap
.
put
(
word
,
wordCountMap
.
getOrDefault
(
word
,
0
)
+
1
);
}
// 计算总词数
int
totalWords
=
termList
.
size
();
// 计算重复词的次数(即出现次数大于1的词的总次数)
int
repeatedWordsCount
=
0
;
for
(
int
count
:
wordCountMap
.
values
())
{
if
(
count
>
1
)
{
// 只计算重复的次数
repeatedWordsCount
+=
count
-
1
;
}
}
// 计算词重复率
double
repetitionRate
=
(
double
)
repeatedWordsCount
/
totalWords
;
return
repetitionRate
<
threshold
?
document
:
DatasetConstant
.
EMPTY_STR
;
}
/**
* 检查文档的字重复率
* @param text 清洗数据
* @param threshold 阀值
* @return 清洗后的数据
*/
public
static
String
filterWordRepetition
(
String
text
,
double
threshold
)
{
// 将文本转换为字符数组
char
[]
characters
=
text
.
toCharArray
();
// 统计每个字的出现次数
Map
<
Character
,
Integer
>
characterCountMap
=
new
HashMap
<>();
for
(
char
c
:
characters
)
{
characterCountMap
.
put
(
c
,
characterCountMap
.
getOrDefault
(
c
,
0
)
+
1
);
}
// 计算总字数
int
totalCharacters
=
characters
.
length
;
// 计算重复字的次数(即出现次数大于1的字的总次数)
int
repeatedCharactersCount
=
0
;
for
(
int
count
:
characterCountMap
.
values
())
{
if
(
count
>
1
)
{
// 只计算重复的次数
repeatedCharactersCount
+=
count
-
1
;
}
}
// 计算字重复率
double
repetitionRate
=
(
double
)
repeatedCharactersCount
/
totalCharacters
;
return
repetitionRate
<
threshold
?
text
:
DatasetConstant
.
EMPTY_STR
;
}
/**
* 过滤检查文档的特殊字符率
* @param data 过滤数据
* @param radio 最大长度
* @return 返回截取后的字符串
*/
private
static
String
filterSpecialCharacters
(
String
data
,
double
radio
)
{
StringBuffer
result
=
new
StringBuffer
();
double
specialCharacterRatio
=
calculateSpecialCharacterRatio
(
data
);
if
(
specialCharacterRatio
>
radio
)
{
result
.
append
(
data
.
replaceAll
(
"[#$%^&*()]"
,
""
));
}
else
{
result
.
append
(
data
);
}
return
result
.
toString
();
}
/**
* 计算特殊字符的阀值
* @param data 数据
* @return 阀值
*/
private
static
double
calculateSpecialCharacterRatio
(
String
data
)
{
List
<
Term
>
termList
=
HanLP
.
segment
(
data
);
int
specialCharactersCount
=
0
;
for
(
Term
term
:
termList
)
{
if
(
term
.
word
.
matches
(
".*[#$%^&*()].*"
))
{
specialCharactersCount
++;
}
}
return
specialCharactersCount
/
(
double
)
termList
.
size
();
}
/**
* 过滤检查文档的词数目
* @param data 过滤数据
* @param radio 最大词条目数据
* @return 返回截取后的字符串
*/
public
static
String
filterNumberWords
(
String
data
,
int
radio
)
{
data
=
data
.
toLowerCase
();
if
(
data
.
length
()
<=
radio
)
{
return
data
;
}
int
startIndex
=
data
.
length
()
-
radio
;
return
data
.
substring
(
startIndex
);
}
    /**
     * Word-count range check: segments the text with HanLP and keeps it only
     * when its word count lies within [minWordCount, maxWordCount].
     *
     * @param content      text to check
     * @param minWordCount minimum allowed word count (inclusive)
     * @param maxWordCount maximum allowed word count (inclusive)
     * @return the original text when in range, otherwise an empty string
     */
    private static String filterNumberWords(String content, int minWordCount, int maxWordCount) {
        // Standard segmentation.
        List<Term> termList = HanLP.segment(content);
        // Number of words.
        int wordCount = termList.size();
        // Check whether the word count lies within the allowed range.
        if (wordCount < minWordCount || wordCount > maxWordCount) {
            // Out of range: discard the document.
            return DatasetConstant.EMPTY_STR;
        }
        // In range: keep the original text.
        return content;
    }
}
application-webadmin/src/main/java/com/yice/webadmin/app/util/JsonNameExtractor.java
View file @
7f447ff0
...
...
@@ -2,6 +2,8 @@ package com.yice.webadmin.app.util;
import
com.fasterxml.jackson.databind.JsonNode
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
com.yice.webadmin.app.constant.DatasetConstant
;
import
com.yice.webadmin.app.data.DatasetRule
;
import
lombok.extern.slf4j.Slf4j
;
import
java.io.IOException
;
...
...
@@ -62,4 +64,24 @@ public class JsonNameExtractor {
return
names
;
}
/**
* 拼接名字
* @param rule json名字
* @return 名字列表
*/
public
static
DatasetRule
buildRuleData
(
String
rule
)
{
DatasetRule
datasetRule
=
new
DatasetRule
();
try
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
JsonNode
config
=
objectMapper
.
readTree
(
rule
);
String
args
=
config
.
get
(
DatasetConstant
.
ARGS
).
textValue
();
datasetRule
.
setArgs
(
Double
.
valueOf
(
args
));
String
name
=
config
.
get
(
DatasetConstant
.
NAME
).
textValue
();
datasetRule
.
setName
(
name
);
}
catch
(
IOException
e
)
{
log
.
error
(
"extract name method overload is error"
,
e
);
}
return
datasetRule
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
View file @
7f447ff0
...
...
@@ -64,7 +64,7 @@ public class DatasetCleanVo {
/**
* 清洗状态。
*/
@ApiModelProperty
(
value
=
"清洗状态"
)
@ApiModelProperty
(
value
=
"清洗状态
;0:未清洗;1:已清洗;2:暂停清洗
"
)
private
Integer
cleanStatus
;
/**
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetDataFilterVo.java
View file @
7f447ff0
...
...
@@ -29,10 +29,16 @@ public class DatasetDataFilterVo {
private
Long
cleanId
;
/**
*
过滤内容
。
*
清洗前数据
。
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
private
String
content
;
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
* 创建时间。
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment