Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
lmp_server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lmp
lmp_server
Commits
74073a00
Commit
74073a00
authored
Apr 09, 2024
by
pengxin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
调整清洗事务管理。
parent
2cc62002
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
31 deletions
+45
-31
DatasetCleanServiceImpl.java
...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java
+6
-1
DataCleanerUtil.java
...main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
+39
-30
No files found.
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java
View file @
74073a00
...
...
@@ -121,7 +121,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
return
datasetClean
;
}
@Transactional
(
rollbackFor
=
Exception
.
class
)
@Async
(
"taskExecutor"
)
public
Future
<
Void
>
executeCleanTaskAsync
(
List
<
DatasetData
>
dataList
,
Long
cleanId
,
Long
datasetId
)
{
asyncDealWithDatasetSaveBatch
(
dataList
,
cleanId
);
...
...
@@ -194,6 +193,12 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
if
(
null
!=
clean
){
doDatasetCleanHandler
(
clean
.
getDatasetId
(),
cleanId
);
}
//重新清洗
DatasetClean
filter
=
new
DatasetClean
();
filter
.
setCleanStatus
(
DatasetConstant
.
CLEAN_PROGRESS
);
filter
.
setFinishTime
(
null
);
filter
.
setCleanId
(
cleanId
);
this
.
updateById
(
filter
);
}
/**
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
View file @
74073a00
...
...
@@ -65,7 +65,7 @@ public class DataCleanerUtil {
data
=
data
.
replaceAll
(
"[\\x00-\\x1F\\x7F-\\x9F]"
,
""
);
break
;
case
DatasetCleanConstant
.
REPLACE_UNIFORM_WHITESPACE
:
data
=
data
.
replaceAll
(
"
[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+
"
,
""
);
data
=
data
.
replaceAll
(
"
(?<![a-zA-Z])[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000](?![a-zA-Z])
"
,
""
);
break
;
case
DatasetCleanConstant
.
REMOVE_NON_MEANING_CHARACTERS
:
data
=
data
.
replaceAll
(
"[\\p{Cntrl}\\p{Cn}]"
,
""
);
...
...
@@ -107,6 +107,12 @@ public class DataCleanerUtil {
return
data
;
}
/**
 * Ad-hoc manual check: passes a fixed sample sentence to
 * {@code filterSpecialCharacters} with a ratio argument of 0.2 and prints
 * the returned string to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sample = "我可以提供各种类型的帮助,包括回答问题、提供信息、解决$#问题、提供建议等。只要是我能力范围内^*的需求,我都会尽力帮助用户解决。";
    final String cleaned = filterSpecialCharacters(sample, 0.2);
    System.out.println(cleaned);
}
/**
* 计算阀值
* @param data 计算坏的数据
...
...
@@ -162,22 +168,25 @@ public class DataCleanerUtil {
Map
<
String
,
Integer
>
wordCountMap
=
new
HashMap
<>();
for
(
Term
term
:
termList
)
{
String
word
=
term
.
word
;
wordCountMap
.
put
(
word
,
wordCountMap
.
getOrDefault
(
word
,
0
)
+
1
);
// 忽略空格
if
(
StringUtils
.
isNotBlank
(
word
))
{
wordCountMap
.
put
(
word
,
wordCountMap
.
getOrDefault
(
word
,
0
)
+
1
);
}
}
// 计算总词数
int
totalWords
=
termList
.
size
();
// 找到出现次数最多的词
String
mostFrequentWord
=
wordCountMap
.
entrySet
().
stream
()
.
max
(
Comparator
.
comparingInt
(
Map
.
Entry
::
getValue
))
.
map
(
Map
.
Entry
::
getKey
)
.
orElse
(
null
);
// 计算重复词的次数(即出现次数大于1的词的总次数)
int
repeatedWordsCount
=
0
;
for
(
int
count
:
wordCountMap
.
values
())
{
if
(
count
>
1
)
{
// 只计算重复的次数
repeatedWordsCount
+=
count
-
1
;
}
// 如果没有词出现超过一次,直接返回原字符串
if
(
mostFrequentWord
==
null
)
{
return
document
;
}
// 计算词重复率
double
repetitionRate
=
(
double
)
repeatedWordsCount
/
totalWords
;
// 计算最大词的重复率
double
repetitionRate
=
(
double
)
(
wordCountMap
.
get
(
mostFrequentWord
)
-
1
)
/
(
double
)
termList
.
size
();
return
repetitionRate
>
threshold
?
DatasetConstant
.
EMPTY_STR
:
document
;
}
...
...
@@ -188,7 +197,6 @@ public class DataCleanerUtil {
* @param threshold 阀值
* @return 清洗后的数据
*/
public
static
String
filterWordRepetition
(
String
text
,
double
threshold
)
{
// 将文本转换为字符数组
char
[]
characters
=
text
.
toCharArray
();
...
...
@@ -196,25 +204,26 @@ public class DataCleanerUtil {
// 统计每个字的出现次数
Map
<
Character
,
Integer
>
characterCountMap
=
new
HashMap
<>();
for
(
char
c
:
characters
)
{
characterCountMap
.
put
(
c
,
characterCountMap
.
getOrDefault
(
c
,
0
)
+
1
);
// 忽略空白符
if
(!
Character
.
isWhitespace
(
c
))
{
characterCountMap
.
put
(
c
,
characterCountMap
.
getOrDefault
(
c
,
0
)
+
1
);
}
}
// 计算总字数
int
totalCharacters
=
characters
.
length
;
// 找到出现次数最多的字
Character
mostFrequentCharacter
=
characterCountMap
.
entrySet
().
stream
()
.
max
(
Comparator
.
comparingInt
(
Map
.
Entry
::
getValue
))
.
map
(
Map
.
Entry
::
getKey
)
.
orElse
(
null
);
// 计算重复字的次数(即出现次数大于1的字的总次数)
int
repeatedCharactersCount
=
0
;
for
(
int
count
:
characterCountMap
.
values
())
{
if
(
count
>
1
)
{
// 只计算重复的次数
repeatedCharactersCount
+=
count
-
1
;
}
// 如果没有字出现超过一次,直接返回原字符串
if
(
mostFrequentCharacter
==
null
)
{
return
text
;
}
// 计算
字
重复率
double
repetitionRate
=
(
double
)
repeatedCharactersCount
/
totalCharacters
;
// 计算
最大字的
重复率
double
repetitionRate
=
(
double
)
(
characterCountMap
.
get
(
mostFrequentCharacter
)
-
1
)
/
(
double
)
characters
.
length
;
//根据阀值判断进行数据返回
return
repetitionRate
>
threshold
?
DatasetConstant
.
EMPTY_STR
:
text
;
}
...
...
@@ -227,8 +236,8 @@ public class DataCleanerUtil {
private
static
String
filterSpecialCharacters
(
String
data
,
double
radio
)
{
StringBuffer
result
=
new
StringBuffer
();
double
specialCharacterRatio
=
calculateSpecialCharacterRatio
(
data
);
if
(
specialCharacterRatio
>
radio
)
{
result
.
append
(
data
.
replaceAll
(
"[#$
%
^&*()]"
,
""
));
if
(
specialCharacterRatio
<=
radio
)
{
result
.
append
(
data
.
replaceAll
(
"[#$^&*()]"
,
""
));
}
else
{
result
.
append
(
data
);
}
...
...
@@ -258,7 +267,7 @@ public class DataCleanerUtil {
* @param radio 最大词条目数据
* @return 返回截取后的字符串
*/
p
ublic
static
String
filterNumberWords
(
String
data
,
int
radio
)
{
p
rivate
static
String
filterNumberWords
(
String
data
,
int
radio
)
{
data
=
data
.
toLowerCase
();
if
(
data
.
length
()
<=
radio
)
{
return
data
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment