Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
lmp_server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lmp
lmp_server
Commits
32843a49
Commit
32843a49
authored
Apr 02, 2024
by
pengxin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
清洗数据新增规则。
parent
56c625a3
Changes
21
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
634 additions
and
53 deletions
+634
-53
pom.xml
application-webadmin/pom.xml
+5
-0
DatasetCleanConstant.java
.../com/yice/webadmin/app/constant/DatasetCleanConstant.java
+53
-0
DatasetConstant.java
.../java/com/yice/webadmin/app/constant/DatasetConstant.java
+21
-1
DatasetCleanController.java
.../yice/webadmin/app/controller/DatasetCleanController.java
+20
-8
DatasetVersionController.java
...ice/webadmin/app/controller/DatasetVersionController.java
+1
-1
DatasetDataDeduplicate.java
...va/com/yice/webadmin/app/data/DatasetDataDeduplicate.java
+5
-2
DatasetDataDesensitive.java
...va/com/yice/webadmin/app/data/DatasetDataDesensitive.java
+5
-2
DatasetCleanDto.java
.../main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
+7
-0
DatasetDataDeduplicateDto.java
.../com/yice/webadmin/app/dto/DatasetDataDeduplicateDto.java
+9
-3
DatasetDataDesensitiveDto.java
.../com/yice/webadmin/app/dto/DatasetDataDesensitiveDto.java
+9
-3
DatasetClean.java
...c/main/java/com/yice/webadmin/app/model/DatasetClean.java
+6
-0
DatasetCleanService.java
...va/com/yice/webadmin/app/service/DatasetCleanService.java
+7
-3
DatasetDataService.java
...ava/com/yice/webadmin/app/service/DatasetDataService.java
+9
-0
DatasetCleanConfigServiceImpl.java
...admin/app/service/impl/DatasetCleanConfigServiceImpl.java
+6
-4
DatasetCleanServiceImpl.java
...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java
+357
-20
DatasetDataServiceImpl.java
...ice/webadmin/app/service/impl/DatasetDataServiceImpl.java
+12
-0
DataCleanerUtil.java
...main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
+53
-0
JsonNameExtractor.java
...in/java/com/yice/webadmin/app/util/JsonNameExtractor.java
+24
-0
DatasetCleanVo.java
...rc/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
+7
-0
DatasetDataDeduplicateVo.java
...va/com/yice/webadmin/app/vo/DatasetDataDeduplicateVo.java
+9
-3
DatasetDataDesensitiveVo.java
...va/com/yice/webadmin/app/vo/DatasetDataDesensitiveVo.java
+9
-3
No files found.
application-webadmin/pom.xml
View file @
32843a49
...
@@ -65,6 +65,11 @@
...
@@ -65,6 +65,11 @@
<artifactId>
core
</artifactId>
<artifactId>
core
</artifactId>
<version>
3.4.1
</version>
<version>
3.4.1
</version>
</dependency>
</dependency>
<dependency>
<groupId>
com.github.houbb
</groupId>
<artifactId>
opencc4j
</artifactId>
<version>
1.6.2
</version>
</dependency>
</dependencies>
</dependencies>
<build>
<build>
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetCleanConstant.java
0 → 100644
View file @
32843a49
package
com
.
yice
.
webadmin
.
app
.
constant
;
/**
 * Rule keys for dataset text cleaning.
 *
 * <p>Each constant is the string name of one cleaning rule. This class only
 * holds constants, so it is final and cannot be instantiated.
 */
public final class DatasetCleanConstant {

    /**
     * Rule key: remove invisible characters.
     */
    public static final String REMOVE_INVISIBLE_CHARACTER = "remove_invisible_character";

    /**
     * Rule key: normalize whitespace.
     */
    public static final String REPLACE_UNIFORM_WHITESPACE = "replace_uniform_whitespace";

    /**
     * Rule key: remove garbled (meaningless) characters.
     */
    public static final String REMOVE_NON_MEANING_CHARACTERS = "remove_non_meaning_characters";

    /**
     * Rule key: convert Traditional Chinese to Simplified Chinese.
     */
    public static final String REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED = "replace_traditional_chinese_to_simplified";

    /**
     * Rule key: remove web identifiers.
     */
    public static final String REMOVE_WEB_IDENTIFIERS = "remove_web_identifiers";

    /**
     * Rule key: remove emoji.
     */
    public static final String REMOVE_EMOJI = "remove_emoji";

    /**
     * Rule key: replace e-mail addresses.
     */
    public static final String REPLACE_EMAILS = "replace_emails";

    /**
     * Rule key: replace IP addresses.
     */
    public static final String REPLACE_IP = "replace_ip";

    /**
     * Rule key: replace identifiers (the original comment described this as removing digits).
     */
    public static final String REPLACE_IDENTIFIER = "replace_identifier";

    /**
     * Not instantiable: this class is a constants holder only.
     */
    private DatasetCleanConstant() {
    }
}
application-webadmin/src/main/java/com/yice/webadmin/app/constant/DatasetConstant.java
View file @
32843a49
...
@@ -22,6 +22,11 @@ public class DatasetConstant {
...
@@ -22,6 +22,11 @@ public class DatasetConstant {
*/
*/
public
static
final
Integer
UNMARK
=
0
;
public
static
final
Integer
UNMARK
=
0
;
/**
* 默认单次写入10000条数据
*/
public
static
final
Integer
MAX_SIZE
=
10000
;
/**
/**
* 已完成状态
* 已完成状态
*/
*/
...
@@ -57,6 +62,11 @@ public class DatasetConstant {
...
@@ -57,6 +62,11 @@ public class DatasetConstant {
*/
*/
public
static
final
String
OUTPUT
=
"output"
;
public
static
final
String
OUTPUT
=
"output"
;
/**
* data数据
*/
public
static
final
String
DATA
=
"data"
;
/**
/**
* 已标记
* 已标记
*/
*/
...
@@ -65,7 +75,17 @@ public class DatasetConstant {
...
@@ -65,7 +75,17 @@ public class DatasetConstant {
/**
/**
* 清洗中
* 清洗中
*/
*/
public
static
final
Integer
CLEAN_PROGRESS
=
1
;
public
static
final
Integer
CLEAN_PROGRESS
=
0
;
/**
* 清洗完成
*/
public
static
final
Integer
CLEAN_FINISHED
=
1
;
/**
* 文本数据清洗
*/
public
static
final
Integer
CLEAN_TYPE
=
1
;
/**
/**
* 分页个数
* 分页个数
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/controller/DatasetCleanController.java
View file @
32843a49
...
@@ -10,10 +10,8 @@ import com.yice.common.core.util.MyModelUtil;
...
@@ -10,10 +10,8 @@ import com.yice.common.core.util.MyModelUtil;
import
com.yice.common.core.util.MyPageUtil
;
import
com.yice.common.core.util.MyPageUtil
;
import
com.yice.common.log.annotation.OperationLog
;
import
com.yice.common.log.annotation.OperationLog
;
import
com.yice.common.log.model.constant.SysOperationLogType
;
import
com.yice.common.log.model.constant.SysOperationLogType
;
import
com.yice.webadmin.app.dto.DatasetCleanConfigDto
;
import
com.yice.webadmin.app.dto.DatasetCleanDto
;
import
com.yice.webadmin.app.dto.DatasetCleanDto
;
import
com.yice.webadmin.app.model.DatasetClean
;
import
com.yice.webadmin.app.model.DatasetClean
;
import
com.yice.webadmin.app.model.DatasetCleanConfig
;
import
com.yice.webadmin.app.service.DatasetCleanService
;
import
com.yice.webadmin.app.service.DatasetCleanService
;
import
com.yice.webadmin.app.vo.DatasetCleanVo
;
import
com.yice.webadmin.app.vo.DatasetCleanVo
;
import
io.swagger.annotations.Api
;
import
io.swagger.annotations.Api
;
...
@@ -63,21 +61,35 @@ public class DatasetCleanController {
...
@@ -63,21 +61,35 @@ public class DatasetCleanController {
* @param datasetCleanDto 新增对象。
* @param datasetCleanDto 新增对象。
* @return 应答结果对象,包含新增对象主键Id。
* @return 应答结果对象,包含新增对象主键Id。
*/
*/
@ApiOperationSupport
(
ignoreParameters
=
{
"datasetCleanDto.cleanId"
,
"datasetCleanConfigDto.cleanConfigId"
})
@ApiOperationSupport
(
ignoreParameters
=
{
"datasetCleanDto.cleanId"
})
@OperationLog
(
type
=
SysOperationLogType
.
ADD_ALL
)
@OperationLog
(
type
=
SysOperationLogType
.
ADD_ALL
)
@PostMapping
(
"/addAll"
)
@PostMapping
(
"/startClean"
)
public
ResponseResult
<
DatasetClean
>
addAll
(
@MyRequestBody
DatasetCleanDto
datasetCleanDto
,
public
ResponseResult
<
DatasetClean
>
startClean
(
@MyRequestBody
DatasetCleanDto
datasetCleanDto
)
{
@MyRequestBody
DatasetCleanConfigDto
datasetCleanConfigDto
)
{
String
errorMessage
=
MyCommonUtil
.
getModelValidationError
(
datasetCleanDto
,
false
);
String
errorMessage
=
MyCommonUtil
.
getModelValidationError
(
datasetCleanDto
,
false
);
if
(
errorMessage
!=
null
)
{
if
(
errorMessage
!=
null
)
{
return
ResponseResult
.
error
(
ErrorCodeEnum
.
DATA_VALIDATED_FAILED
,
errorMessage
);
return
ResponseResult
.
error
(
ErrorCodeEnum
.
DATA_VALIDATED_FAILED
,
errorMessage
);
}
}
DatasetClean
datasetClean
=
MyModelUtil
.
copyTo
(
datasetCleanDto
,
DatasetClean
.
class
);
DatasetClean
datasetClean
=
MyModelUtil
.
copyTo
(
datasetCleanDto
,
DatasetClean
.
class
);
DatasetCleanConfig
datasetCleanConfig
=
MyModelUtil
.
copyTo
(
datasetCleanConfigDto
,
DatasetCleanConfig
.
class
);
datasetClean
=
datasetCleanService
.
addNew
(
datasetClean
);
datasetClean
=
datasetCleanService
.
saveNew
(
datasetClean
,
datasetCleanConfig
);
return
ResponseResult
.
success
(
datasetClean
);
return
ResponseResult
.
success
(
datasetClean
);
}
}
/**
 * Stops a dataset cleaning task.
 *
 * @param cleanId id of the cleaning task to stop.
 * @return an {@code ARGUMENT_NULL_EXIST} error response when the id fails the
 *         blank-argument check; otherwise an empty success response, returned
 *         after the service has been asked to stop the task.
 */
@OperationLog(type = SysOperationLogType.DELETE)
@PostMapping("/stopClean")
public ResponseResult<Void> stopClean(@RequestParam Long cleanId) {
    boolean idMissing = MyCommonUtil.existBlankArgument(cleanId);
    if (!idMissing) {
        datasetCleanService.stopCleanTask(cleanId);
        return ResponseResult.success();
    }
    return ResponseResult.error(ErrorCodeEnum.ARGUMENT_NULL_EXIST);
}
/**
/**
* 更新数据集清洗数据。
* 更新数据集清洗数据。
*
*
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/controller/DatasetVersionController.java
View file @
32843a49
...
@@ -356,7 +356,7 @@ public class DatasetVersionController {
...
@@ -356,7 +356,7 @@ public class DatasetVersionController {
return
ResponseResult
.
error
(
ErrorCodeEnum
.
ARGUMENT_NULL_EXIST
,
errorMessage
);
return
ResponseResult
.
error
(
ErrorCodeEnum
.
ARGUMENT_NULL_EXIST
,
errorMessage
);
}
}
DatasetVersion
datasetVersion
=
this
.
datasetVersionService
.
getById
(
versionId
);
DatasetVersion
datasetVersion
=
this
.
datasetVersionService
.
getById
(
versionId
);
String
versionName
=
datasetVersion
.
getVersionName
();
String
versionName
=
datasetVersion
.
getVersionName
()
+
"_V"
+
datasetVersion
.
getDatasetVersion
()
;
//先存储文件
//先存储文件
String
fullName
=
this
.
saveDatasetFile
(
importFile
,
versionName
,
versionId
);
String
fullName
=
this
.
saveDatasetFile
(
importFile
,
versionName
,
versionId
);
//再存储数据集配置文件
//再存储数据集配置文件
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataDeduplicate.java
View file @
32843a49
...
@@ -24,8 +24,11 @@ public class DatasetDataDeduplicate {
...
@@ -24,8 +24,11 @@ public class DatasetDataDeduplicate {
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"content"
,
value
=
"去重内容"
)
@ApiModelProperty
(
name
=
"clean_before_data"
,
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
@ApiModelProperty
(
name
=
"clean_after_data"
,
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
private
Date
createTime
;
private
Date
createTime
;
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataDesensitive.java
View file @
32843a49
...
@@ -24,8 +24,11 @@ public class DatasetDataDesensitive {
...
@@ -24,8 +24,11 @@ public class DatasetDataDesensitive {
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"content"
,
value
=
"去隐私内容"
)
@ApiModelProperty
(
name
=
"clean_before_data"
,
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
@ApiModelProperty
(
name
=
"clean_after_data"
,
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
private
Date
createTime
;
private
Date
createTime
;
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetCleanDto.java
View file @
32843a49
package
com
.
yice
.
webadmin
.
app
.
dto
;
package
com
.
yice
.
webadmin
.
app
.
dto
;
import
com.yice.webadmin.app.model.DatasetCleanConfig
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.Data
;
import
lombok.Data
;
...
@@ -36,6 +37,12 @@ public class DatasetCleanDto {
...
@@ -36,6 +37,12 @@ public class DatasetCleanDto {
@ApiModelProperty
(
value
=
"清洗数据集名称"
)
@ApiModelProperty
(
value
=
"清洗数据集名称"
)
private
String
datasetName
;
private
String
datasetName
;
/**
* 清洗配置对象。
*/
@ApiModelProperty
(
value
=
"清洗配置对象"
)
private
DatasetCleanConfig
config
;
/**
/**
* 清洗方式。
* 清洗方式。
*/
*/
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetDataDeduplicateDto.java
View file @
32843a49
...
@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateDto {
...
@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateDto {
private
Long
cleanId
;
private
Long
cleanId
;
/**
/**
*
过滤内容
。
*
清洗前数据
。
*/
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
/**
* 创建时间。
* 创建时间。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/dto/DatasetDataDesensitiveDto.java
View file @
32843a49
...
@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveDto {
...
@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveDto {
private
Long
cleanId
;
private
Long
cleanId
;
/**
/**
*
过滤内容
。
*
清洗前数据
。
*/
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
/**
* 创建时间。
* 创建时间。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/model/DatasetClean.java
View file @
32843a49
...
@@ -80,6 +80,12 @@ public class DatasetClean extends BaseModel {
...
@@ -80,6 +80,12 @@ public class DatasetClean extends BaseModel {
@TableField
(
exist
=
false
)
@TableField
(
exist
=
false
)
private
List
<
String
>
cleanMethod
;
private
List
<
String
>
cleanMethod
;
/**
* 清洗配置对象。
*/
@TableField
(
exist
=
false
)
private
DatasetCleanConfig
config
;
/**
/**
* 创建人名称字典。
* 创建人名称字典。
*/
*/
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/DatasetCleanService.java
View file @
32843a49
...
@@ -2,7 +2,6 @@ package com.yice.webadmin.app.service;
...
@@ -2,7 +2,6 @@ package com.yice.webadmin.app.service;
import
com.yice.common.core.base.service.IBaseService
;
import
com.yice.common.core.base.service.IBaseService
;
import
com.yice.webadmin.app.model.DatasetClean
;
import
com.yice.webadmin.app.model.DatasetClean
;
import
com.yice.webadmin.app.model.DatasetCleanConfig
;
import
java.util.List
;
import
java.util.List
;
...
@@ -22,14 +21,19 @@ public interface DatasetCleanService extends IBaseService<DatasetClean, Long> {
...
@@ -22,14 +21,19 @@ public interface DatasetCleanService extends IBaseService<DatasetClean, Long> {
*/
*/
DatasetClean
saveNew
(
DatasetClean
datasetClean
);
DatasetClean
saveNew
(
DatasetClean
datasetClean
);
/**
* 停止清洗任务
* @param cleanId 清洗任务id
*/
void
stopCleanTask
(
Long
cleanId
);
/**
/**
* 保存清洗对象以及清洗配置对象。
* 保存清洗对象以及清洗配置对象。
*
*
* @param datasetClean 新增对象。
* @param datasetClean 新增对象。
* @param datasetCleanConfig 新增配置对象。
* @return 返回新增对象。
* @return 返回新增对象。
*/
*/
DatasetClean
saveNew
(
DatasetClean
datasetClean
,
DatasetCleanConfig
datasetCleanConfig
);
DatasetClean
addNew
(
DatasetClean
datasetClean
);
/**
/**
* 利用数据库的insertList语法,批量插入对象列表。
* 利用数据库的insertList语法,批量插入对象列表。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/DatasetDataService.java
View file @
32843a49
...
@@ -78,6 +78,15 @@ public interface DatasetDataService {
...
@@ -78,6 +78,15 @@ public interface DatasetDataService {
*/
*/
void
update
(
DatasetData
datasetData
);
void
update
(
DatasetData
datasetData
);
/**
* 批量处理数据集列表。
*
* @param dataList 批量处理数据集列表。
* @param versionId 版本标识
* @return 返回修改后的对象。
*/
void
updateBatch
(
List
<
DatasetData
>
dataList
,
Long
versionId
);
/**
/**
* 删除指定数据。
* 删除指定数据。
*
*
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanConfigServiceImpl.java
View file @
32843a49
...
@@ -264,7 +264,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
...
@@ -264,7 +264,8 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
List
<
Document
>
documents
=
new
ArrayList
<>();
List
<
Document
>
documents
=
new
ArrayList
<>();
if
(
CollUtil
.
isNotEmpty
(
deduplicates
))
{
if
(
CollUtil
.
isNotEmpty
(
deduplicates
))
{
for
(
DatasetDataDeduplicate
deduplicate
:
deduplicates
)
{
for
(
DatasetDataDeduplicate
deduplicate
:
deduplicates
)
{
Document
document
=
new
Document
(
MongoConstant
.
CONTENT
,
deduplicate
.
getContent
())
Document
document
=
new
Document
(
MongoConstant
.
CLEAN_BEFORE_DATA
,
deduplicate
.
getCleanBeforeData
())
.
append
(
MongoConstant
.
CLEAN_AFTER_DATA
,
deduplicate
.
getCleanAfterData
())
.
append
(
MongoConstant
.
CLEAN_ID
,
deduplicate
.
getCleanId
())
.
append
(
MongoConstant
.
CLEAN_ID
,
deduplicate
.
getCleanId
())
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
documents
.
add
(
document
);
documents
.
add
(
document
);
...
@@ -283,9 +284,10 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
...
@@ -283,9 +284,10 @@ public class DatasetCleanConfigServiceImpl extends BaseService<DatasetCleanConfi
public
void
saveDatasetDesensitive
(
List
<
DatasetDataDesensitive
>
desensitives
)
{
public
void
saveDatasetDesensitive
(
List
<
DatasetDataDesensitive
>
desensitives
)
{
List
<
Document
>
documents
=
new
ArrayList
<>();
List
<
Document
>
documents
=
new
ArrayList
<>();
if
(
CollUtil
.
isNotEmpty
(
desensitives
))
{
if
(
CollUtil
.
isNotEmpty
(
desensitives
))
{
for
(
DatasetDataDesensitive
dataDesensitive
:
desensitives
)
{
for
(
DatasetDataDesensitive
desensitive
:
desensitives
)
{
Document
document
=
new
Document
(
MongoConstant
.
CONTENT
,
dataDesensitive
.
getContent
())
Document
document
=
new
Document
(
MongoConstant
.
CLEAN_BEFORE_DATA
,
desensitive
.
getCleanBeforeData
())
.
append
(
MongoConstant
.
CLEAN_ID
,
dataDesensitive
.
getCleanId
())
.
append
(
MongoConstant
.
CLEAN_AFTER_DATA
,
desensitive
.
getCleanAfterData
())
.
append
(
MongoConstant
.
CLEAN_ID
,
desensitive
.
getCleanId
())
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
.
append
(
MongoConstant
.
CREATE_TIME
,
new
Date
());
documents
.
add
(
document
);
documents
.
add
(
document
);
}
}
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java
View file @
32843a49
This diff is collapsed.
Click to expand it.
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetDataServiceImpl.java
View file @
32843a49
...
@@ -148,6 +148,18 @@ public class DatasetDataServiceImpl implements DatasetDataService {
...
@@ -148,6 +148,18 @@ public class DatasetDataServiceImpl implements DatasetDataService {
MongoConstant
.
COLLECT_NAME
+
datasetData
.
getVersionId
());
MongoConstant
.
COLLECT_NAME
+
datasetData
.
getVersionId
());
}
}
/**
 * Saves a batch of dataset entries into the collection of one dataset version.
 *
 * <p>Entries are saved one by one. Handing the whole list to a single
 * {@code save} call would pass the list itself as the object to persist
 * instead of its elements.
 * NOTE(review): assumes {@code mongoTemplate} is Spring Data's MongoTemplate — confirm.
 *
 * @param dataList  entries to save; a {@code null} or empty list is a no-op.
 * @param versionId dataset version id; appended to {@code MongoConstant.COLLECT_NAME}
 *                  to form the target collection name.
 */
@Override
public void updateBatch(List<DatasetData> dataList, Long versionId) {
    if (dataList == null || dataList.isEmpty()) {
        return;
    }
    // Build the collection name once; it is the same for every entry of the batch.
    String collectionName = MongoConstant.COLLECT_NAME + versionId;
    for (DatasetData datasetData : dataList) {
        mongoTemplate.save(datasetData, collectionName);
    }
}
/**
/**
* 删除指定数据。
* 删除指定数据。
*
*
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/util/DataCleanerUtil.java
0 → 100644
View file @
32843a49
package
com
.
yice
.
webadmin
.
app
.
util
;
import com.github.houbb.opencc4j.util.ZhConverterUtil;
import com.yice.webadmin.app.constant.DatasetCleanConstant;
import lombok.extern.slf4j.Slf4j;

import java.util.List;
import java.util.regex.Pattern;
/**
 * Applies the text-cleaning rules named by {@link DatasetCleanConstant} to a string.
 */
@Slf4j
public class DataCleanerUtil {

    /** Every code point in Unicode general category C (control, format, surrogate, private use, unassigned). */
    private static final Pattern INVISIBLE_CHARACTERS = Pattern.compile("[\\p{C}]");

    /** Unicode separator characters (category Z) plus the explicitly listed space code points. */
    private static final Pattern WHITESPACE_VARIANTS =
            Pattern.compile("[\\p{Z}\\u2000-\\u200A\\u2028\\u2029\\u3000]");

    /** Unpaired surrogates, private-use and unassigned code points, treated here as garbled text. */
    private static final Pattern GARBLED_CHARACTERS = Pattern.compile("[\\p{Cs}\\p{Co}\\p{Cn}]");

    /** Anything between an opening and a closing angle bracket. */
    private static final Pattern WEB_TAGS = Pattern.compile("<[^>]*>");

    /**
     * Emoji ranges expressed as code points. Java matches by code point, so a character
     * class of individual surrogates never matches a well-formed emoji pair.
     * Ranges: U+1F3FB-1F3FF, U+1F400-1F64F, U+1F680-1F6FF, U+1F910-1F9FF.
     */
    private static final Pattern EMOJI = Pattern.compile(
            "[\\x{1F3FB}-\\x{1F3FF}\\x{1F400}-\\x{1F64F}\\x{1F680}-\\x{1F6FF}\\x{1F910}-\\x{1F9FF}]");

    /** E-mail addresses; the top-level domain is letters only. */
    private static final Pattern EMAIL =
            Pattern.compile("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b");

    /** Dotted-quad IPv4 addresses. */
    private static final Pattern IP_ADDRESS = Pattern.compile(
            "\\b(25[0-5]\\.|2[0-4][0-9]\\.|[01]?[0-9][0-9]?\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b");

    /** Runs of decimal digits. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Not instantiable: this class only offers static helpers.
     */
    private DataCleanerUtil() {
    }

    /**
     * Applies the given cleaning rules to the text, in list order.
     *
     * @param data  text to clean; {@code null} is returned unchanged.
     * @param rules rule keys from {@link DatasetCleanConstant}; {@code null} leaves the text
     *              unchanged, and {@code null} or unrecognised entries are skipped.
     * @return the text after every recognised rule has been applied.
     */
    public static String buildCleanAfterData(String data, List<String> rules) {
        if (data == null || rules == null) {
            return data;
        }
        String cleaned = data;
        for (String rule : rules) {
            // switch on a null String throws, so skip null entries explicitly.
            if (rule != null) {
                cleaned = applyRule(cleaned, rule);
            }
        }
        return cleaned;
    }

    /**
     * Applies a single rule; text is returned unchanged for a rule key this class does not handle.
     */
    private static String applyRule(String text, String rule) {
        switch (rule) {
            case DatasetCleanConstant.REMOVE_INVISIBLE_CHARACTER:
                return INVISIBLE_CHARACTERS.matcher(text).replaceAll(" ");
            case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
                // Normalize every whitespace variant to a plain space rather than deleting it,
                // so words are not fused together.
                return WHITESPACE_VARIANTS.matcher(text).replaceAll(" ");
            case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
                return GARBLED_CHARACTERS.matcher(text).replaceAll("");
            case DatasetCleanConstant.REPLACE_TRADITIONAL_CHINESE_TO_SIMPLIFIED:
                return ZhConverterUtil.toSimple(text);
            case DatasetCleanConstant.REMOVE_WEB_IDENTIFIERS:
                return WEB_TAGS.matcher(text).replaceAll("");
            case DatasetCleanConstant.REMOVE_EMOJI:
                return EMOJI.matcher(text).replaceAll("");
            case DatasetCleanConstant.REPLACE_EMAILS:
                return EMAIL.matcher(text).replaceAll("EMAIL");
            case DatasetCleanConstant.REPLACE_IP:
                return IP_ADDRESS.matcher(text).replaceAll("IP_ADDRESS");
            case DatasetCleanConstant.REPLACE_IDENTIFIER:
                return DIGITS.matcher(text).replaceAll("PI:KEY");
            default:
                // Rule keys not handled by this class leave the text as it is.
                return text;
        }
    }
}
application-webadmin/src/main/java/com/yice/webadmin/app/util/JsonNameExtractor.java
View file @
32843a49
...
@@ -38,4 +38,28 @@ public class JsonNameExtractor {
...
@@ -38,4 +38,28 @@ public class JsonNameExtractor {
return
names
;
return
names
;
}
}
/**
 * Collects the textual {@code "name"} field of each element of a JSON array.
 *
 * @param rule JSON text expected to be an array of objects,
 *             for example {@code [{"name":"a"},{"name":"b"}]}.
 * @return the names in array order; an empty list when {@code rule} is null or blank,
 *         is not a JSON array, or cannot be parsed (the parse error is logged).
 */
public static List<String> extractNames(String rule) {
    List<String> names = new ArrayList<>();
    // Nothing to parse: avoid handing null/blank content to the JSON parser.
    if (rule == null || rule.trim().isEmpty()) {
        return names;
    }
    try {
        JsonNode rootNode = new ObjectMapper().readTree(rule);
        if (rootNode == null || !rootNode.isArray()) {
            return names;
        }
        for (JsonNode element : rootNode) {
            JsonNode nameNode = element.get("name");
            // Only string-valued names are collected; missing or non-text values are skipped.
            if (nameNode != null && nameNode.isTextual()) {
                names.add(nameNode.asText());
            }
        }
    } catch (IOException e) {
        log.error("extract name method overload is error", e);
    }
    return names;
}
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetCleanVo.java
View file @
32843a49
package
com
.
yice
.
webadmin
.
app
.
vo
;
package
com
.
yice
.
webadmin
.
app
.
vo
;
import
com.yice.webadmin.app.model.DatasetCleanConfig
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.Data
;
import
lombok.Data
;
...
@@ -42,6 +43,12 @@ public class DatasetCleanVo {
...
@@ -42,6 +43,12 @@ public class DatasetCleanVo {
@ApiModelProperty
(
value
=
"清洗数据集名称"
)
@ApiModelProperty
(
value
=
"清洗数据集名称"
)
private
String
datasetName
;
private
String
datasetName
;
/**
* 清洗配置对象。
*/
@ApiModelProperty
(
value
=
"清洗配置对象"
)
private
DatasetCleanConfig
config
;
/**
/**
* 开始时间。
* 开始时间。
*/
*/
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetDataDeduplicateVo.java
View file @
32843a49
...
@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateVo {
...
@@ -29,10 +29,16 @@ public class DatasetDataDeduplicateVo {
private
Long
cleanId
;
private
Long
cleanId
;
/**
/**
*
过滤内容
。
*
清洗前数据
。
*/
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
/**
* 创建时间。
* 创建时间。
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/vo/DatasetDataDesensitiveVo.java
View file @
32843a49
...
@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveVo {
...
@@ -29,10 +29,16 @@ public class DatasetDataDesensitiveVo {
private
Long
cleanId
;
private
Long
cleanId
;
/**
/**
*
过滤内容
。
*
清洗前数据
。
*/
*/
@ApiModelProperty
(
value
=
"过滤内容"
)
@ApiModelProperty
(
value
=
"清洗前数据"
)
private
String
content
;
private
String
cleanBeforeData
;
/**
* 清洗后数据。
*/
@ApiModelProperty
(
value
=
"清洗后数据"
)
private
String
cleanAfterData
;
/**
/**
* 创建时间。
* 创建时间。
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment