Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
lmp_server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lmp
lmp_server
Commits
0634e753
Commit
0634e753
authored
Apr 02, 2024
by
pengxin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加数据清洗功能。
parent
32843a49
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
203 additions
and
122 deletions
+203
-122
DatasetData.java
...src/main/java/com/yice/webadmin/app/data/DatasetData.java
+20
-8
DatasetDataClean.java
...ain/java/com/yice/webadmin/app/data/DatasetDataClean.java
+20
-8
DatasetDataDeduplicate.java
...va/com/yice/webadmin/app/data/DatasetDataDeduplicate.java
+20
-5
DatasetDataDesensitive.java
...va/com/yice/webadmin/app/data/DatasetDataDesensitive.java
+20
-8
DatasetDataFilter.java
...in/java/com/yice/webadmin/app/data/DatasetDataFilter.java
+16
-7
DatasetCleanServiceImpl.java
...ce/webadmin/app/service/impl/DatasetCleanServiceImpl.java
+89
-85
DatasetDataServiceImpl.java
...ice/webadmin/app/service/impl/DatasetDataServiceImpl.java
+18
-1
No files found.
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetData.java
View file @
0634e753
package
com
.
yice
.
webadmin
.
app
.
data
;
package
com
.
yice
.
webadmin
.
app
.
data
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.AllArgsConstructor
;
import
lombok.AllArgsConstructor
;
import
lombok.Data
;
import
lombok.Data
;
import
lombok.NoArgsConstructor
;
import
lombok.NoArgsConstructor
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Field
;
import
java.util.Date
;
import
java.util.Date
;
@Data
@Data
@ApiModel
@NoArgsConstructor
@NoArgsConstructor
@AllArgsConstructor
@AllArgsConstructor
@Document
(
collection
=
"dataset_data"
)
@Document
(
collection
=
"dataset_data"
)
public
class
DatasetData
{
public
class
DatasetData
{
/**
* 文档标识
*/
@Id
@Id
@ApiModelProperty
(
name
=
"_id"
,
value
=
"文档标识"
)
private
String
id
;
private
String
id
;
@ApiModelProperty
(
name
=
"version_id"
,
value
=
"版本标识"
)
/**
* 版本标识
*/
@Field
(
"version_id"
)
private
Long
versionId
;
private
Long
versionId
;
@ApiModelProperty
(
name
=
"data"
,
value
=
"json格式数据"
)
/**
* json格式数据
*/
@Field
(
"data"
)
private
String
data
;
private
String
data
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
/**
* 创建时间
*/
@Field
(
"create_time"
)
private
Date
createTime
;
private
Date
createTime
;
@ApiModelProperty
(
name
=
"mark_status"
,
value
=
"标记状态"
)
/**
* 标记状态
*/
@Field
(
"mark_status"
)
private
Integer
markStatus
;
private
Integer
markStatus
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataClean.java
View file @
0634e753
package
com
.
yice
.
webadmin
.
app
.
data
;
package
com
.
yice
.
webadmin
.
app
.
data
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.AllArgsConstructor
;
import
lombok.AllArgsConstructor
;
import
lombok.Data
;
import
lombok.Data
;
import
lombok.NoArgsConstructor
;
import
lombok.NoArgsConstructor
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Field
;
import
java.util.Date
;
import
java.util.Date
;
@Data
@Data
@ApiModel
@NoArgsConstructor
@NoArgsConstructor
@AllArgsConstructor
@AllArgsConstructor
@Document
(
collection
=
"dataset_data_clean"
)
@Document
(
collection
=
"dataset_data_clean"
)
public
class
DatasetDataClean
{
public
class
DatasetDataClean
{
/**
* 文档标识
*/
@Id
@Id
@ApiModelProperty
(
name
=
"_id"
,
value
=
"文档标识"
)
private
String
id
;
private
String
id
;
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
/**
* 清洗任务标识id
*/
@Field
(
"clean_id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"clean_before_data"
,
value
=
"清洗前数据"
)
/**
* 清洗前数据
*/
@Field
(
"clean_before_data"
)
private
String
cleanBeforeData
;
private
String
cleanBeforeData
;
@ApiModelProperty
(
name
=
"clean_after_data"
,
value
=
"清洗后数据"
)
/**
* 清洗后数据
*/
@Field
(
"clean_after_data"
)
private
String
cleanAfterData
;
private
String
cleanAfterData
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
/**
* 创建时间
*/
@Field
(
"create_time"
)
private
Date
createTime
;
private
Date
createTime
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataDeduplicate.java
View file @
0634e753
...
@@ -7,6 +7,7 @@ import lombok.Data;
...
@@ -7,6 +7,7 @@ import lombok.Data;
import
lombok.NoArgsConstructor
;
import
lombok.NoArgsConstructor
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Field
;
import
java.util.Date
;
import
java.util.Date
;
...
@@ -17,19 +18,33 @@ import java.util.Date;
...
@@ -17,19 +18,33 @@ import java.util.Date;
@Document
(
collection
=
"dataset_data_deduplicate"
)
@Document
(
collection
=
"dataset_data_deduplicate"
)
public
class
DatasetDataDeduplicate
{
public
class
DatasetDataDeduplicate
{
/**
* 文档标识
*/
@Id
@Id
@ApiModelProperty
(
name
=
"_id"
,
value
=
"文档标识"
)
private
String
id
;
private
String
id
;
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
/**
* 清洗任务标识id
*/
@Field
(
"clean_id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"clean_before_data"
,
value
=
"清洗前数据"
)
/**
* 清洗前数据
*/
@Field
(
"clean_before_data"
)
private
String
cleanBeforeData
;
private
String
cleanBeforeData
;
@ApiModelProperty
(
name
=
"clean_after_data"
,
value
=
"清洗后数据"
)
/**
* 清洗后数据
*/
@Field
(
"clean_after_data"
)
private
String
cleanAfterData
;
private
String
cleanAfterData
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
/**
* 创建时间
*/
@Field
(
"create_time"
)
private
Date
createTime
;
private
Date
createTime
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataDesensitive.java
View file @
0634e753
package
com
.
yice
.
webadmin
.
app
.
data
;
package
com
.
yice
.
webadmin
.
app
.
data
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.AllArgsConstructor
;
import
lombok.AllArgsConstructor
;
import
lombok.Data
;
import
lombok.Data
;
import
lombok.NoArgsConstructor
;
import
lombok.NoArgsConstructor
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Field
;
import
java.util.Date
;
import
java.util.Date
;
@Data
@Data
@ApiModel
@NoArgsConstructor
@NoArgsConstructor
@AllArgsConstructor
@AllArgsConstructor
@Document
(
collection
=
"dataset_data_desensitive"
)
@Document
(
collection
=
"dataset_data_desensitive"
)
public
class
DatasetDataDesensitive
{
public
class
DatasetDataDesensitive
{
/**
* 文档标识
*/
@Id
@Id
@ApiModelProperty
(
name
=
"_id"
,
value
=
"文档标识"
)
private
String
id
;
private
String
id
;
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
/**
* 清洗任务标识id
*/
@Field
(
"clean_id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"clean_before_data"
,
value
=
"清洗前数据"
)
/**
* 清洗前数据
*/
@Field
(
"clean_before_data"
)
private
String
cleanBeforeData
;
private
String
cleanBeforeData
;
@ApiModelProperty
(
name
=
"clean_after_data"
,
value
=
"清洗后数据"
)
/**
* 清洗后数据
*/
@Field
(
"clean_after_data"
)
private
String
cleanAfterData
;
private
String
cleanAfterData
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
/**
* 创建时间
*/
@Field
(
"create_time"
)
private
Date
createTime
;
private
Date
createTime
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/data/DatasetDataFilter.java
View file @
0634e753
package
com
.
yice
.
webadmin
.
app
.
data
;
package
com
.
yice
.
webadmin
.
app
.
data
;
import
io.swagger.annotations.ApiModel
;
import
io.swagger.annotations.ApiModelProperty
;
import
lombok.AllArgsConstructor
;
import
lombok.AllArgsConstructor
;
import
lombok.Data
;
import
lombok.Data
;
import
lombok.NoArgsConstructor
;
import
lombok.NoArgsConstructor
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
org.springframework.data.mongodb.core.mapping.Field
;
import
java.util.Date
;
import
java.util.Date
;
@Data
@Data
@ApiModel
@NoArgsConstructor
@NoArgsConstructor
@AllArgsConstructor
@AllArgsConstructor
@Document
(
collection
=
"dataset_data_filter"
)
@Document
(
collection
=
"dataset_data_filter"
)
public
class
DatasetDataFilter
{
public
class
DatasetDataFilter
{
/**
* 文档标识
*/
@Id
@Id
@ApiModelProperty
(
name
=
"_id"
,
value
=
"文档标识"
)
private
String
id
;
private
String
id
;
@ApiModelProperty
(
name
=
"clean_id"
,
value
=
"清洗任务标识id"
)
/**
* 清洗任务标识id
*/
@Field
(
"clean_id"
)
private
Long
cleanId
;
private
Long
cleanId
;
@ApiModelProperty
(
name
=
"content"
,
value
=
"过滤内容"
)
/**
* 清洗后数据
*/
@Field
(
"content"
)
private
String
content
;
private
String
content
;
@ApiModelProperty
(
name
=
"create_time"
,
value
=
"创建时间"
)
/**
* 创建时间
*/
@Field
(
"create_time"
)
private
Date
createTime
;
private
Date
createTime
;
}
}
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetCleanServiceImpl.java
View file @
0634e753
...
@@ -13,7 +13,6 @@ import com.yice.common.core.object.MyPageParam;
...
@@ -13,7 +13,6 @@ import com.yice.common.core.object.MyPageParam;
import
com.yice.common.core.object.MyRelationParam
;
import
com.yice.common.core.object.MyRelationParam
;
import
com.yice.common.core.util.MyModelUtil
;
import
com.yice.common.core.util.MyModelUtil
;
import
com.yice.common.sequence.wrapper.IdGeneratorWrapper
;
import
com.yice.common.sequence.wrapper.IdGeneratorWrapper
;
import
com.yice.webadmin.app.config.PythonConfig
;
import
com.yice.webadmin.app.constant.DatasetConstant
;
import
com.yice.webadmin.app.constant.DatasetConstant
;
import
com.yice.webadmin.app.dao.DatasetCleanConfigMapper
;
import
com.yice.webadmin.app.dao.DatasetCleanConfigMapper
;
import
com.yice.webadmin.app.dao.DatasetCleanMapper
;
import
com.yice.webadmin.app.dao.DatasetCleanMapper
;
...
@@ -33,12 +32,10 @@ import org.springframework.scheduling.annotation.Async;
...
@@ -33,12 +32,10 @@ import org.springframework.scheduling.annotation.Async;
import
org.springframework.scheduling.annotation.AsyncResult
;
import
org.springframework.scheduling.annotation.AsyncResult
;
import
org.springframework.stereotype.Service
;
import
org.springframework.stereotype.Service
;
import
org.springframework.transaction.annotation.Transactional
;
import
org.springframework.transaction.annotation.Transactional
;
import
org.springframework.web.multipart.MultipartFile
;
import
java.io.File
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
...
@@ -68,8 +65,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -68,8 +65,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
private
DatasetDataService
datasetDataService
;
private
DatasetDataService
datasetDataService
;
@Autowired
@Autowired
private
DatasetVersionService
datasetVersionService
;
private
DatasetVersionService
datasetVersionService
;
@Autowired
private
PythonConfig
pythonConfig
;
private
final
ConcurrentHashMap
<
Long
,
Future
<?>>
futures
=
new
ConcurrentHashMap
<>();
private
final
ConcurrentHashMap
<
Long
,
Future
<?>>
futures
=
new
ConcurrentHashMap
<>();
...
@@ -123,11 +118,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -123,11 +118,22 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
datasetVersion
.
setCleanStatus
(
DatasetConstant
.
CLEAN_PROGRESS
);
datasetVersion
.
setCleanStatus
(
DatasetConstant
.
CLEAN_PROGRESS
);
datasetVersionService
.
updateById
(
datasetVersion
);
datasetVersionService
.
updateById
(
datasetVersion
);
//TODO:未完成
doDatasetCleanHandler
(
datasetClean
.
getDatasetId
(),
datasetClean
.
getCleanId
());
//doDatasetCleanHandler(datasetClean.getDatasetId(), datasetClean.getCleanId());
return
datasetClean
;
return
datasetClean
;
}
}
@Async
(
"taskExecutor"
)
public
Future
<
Void
>
executeCleanTaskAsync
(
List
<
DatasetData
>
dataList
,
Long
cleanId
,
Long
datasetId
)
{
asyncDealWithDatasetSaveBatch
(
dataList
,
cleanId
);
dealWithTaskHandler
(
datasetId
,
cleanId
);
DatasetClean
filter
=
new
DatasetClean
();
filter
.
setCleanStatus
(
DatasetConstant
.
CLEAN_FINISHED
);
filter
.
setFinishTime
(
new
Date
());
this
.
updateById
(
filter
);
return
new
AsyncResult
<>(
null
);
}
/**
/**
* 清洗100个样本
* 清洗100个样本
* @param datasetId 数据集对应的版本
* @param datasetId 数据集对应的版本
...
@@ -143,19 +149,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -143,19 +149,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
}
}
}
}
@Async
(
"taskExecutor"
)
public
Future
<
Void
>
executeCleanTaskAsync
(
List
<
DatasetData
>
dataList
,
Long
cleanId
,
Long
datasetId
)
{
asyncDealWithDatasetSaveBatch
(
dataList
,
cleanId
);
dealWithTaskHandler
(
datasetId
,
cleanId
);
dealWithDatasetFileDataHandler
(
datasetId
);
DatasetClean
filter
=
new
DatasetClean
();
filter
.
setCleanStatus
(
DatasetConstant
.
CLEAN_FINISHED
);
filter
.
setFinishTime
(
new
Date
());
this
.
updateById
(
filter
);
return
new
AsyncResult
<>(
null
);
}
/**
/**
* 只异步处理前100条数据清洗数据
* 只异步处理前100条数据清洗数据
* @param dataList 清洗列表
* @param dataList 清洗列表
...
@@ -206,18 +199,27 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -206,18 +199,27 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
rules
.
add
(
config
.
getDesensitiveConfig
());
rules
.
add
(
config
.
getDesensitiveConfig
());
rules
.
add
(
config
.
getDeduplicateConfig
());
rules
.
add
(
config
.
getDeduplicateConfig
());
rules
.
add
(
config
.
getCleanConfig
());
rules
.
add
(
config
.
getCleanConfig
());
rules
=
rules
.
stream
()
.
filter
(
rule
->
rule
!=
null
&&
!
rule
.
isEmpty
())
.
collect
(
Collectors
.
toList
());
rules
=
JsonNameExtractor
.
extractNames
(
rules
);
rules
=
JsonNameExtractor
.
extractNames
(
rules
);
}
}
DatasetVersion
datasetVersion
=
this
.
datasetVersionService
.
getById
(
datasetId
);
datasetVersionService
.
saveDatasetInfo
(
datasetVersion
.
getVersionName
());
clearFileDatasetData
(
datasetVersion
.
getFileUrl
());
Long
count
=
datasetDataService
.
count
(
datasetId
);
Long
count
=
datasetDataService
.
count
(
datasetId
);
int
pageSize
=
DatasetConstant
.
MAX_SIZE
;
int
pageSize
=
DatasetConstant
.
MAX_SIZE
;
int
totalPages
=
(
int
)
Math
.
ceil
((
double
)
count
/
pageSize
);
int
totalPages
=
(
int
)
Math
.
ceil
((
double
)
count
/
pageSize
);
MyPageParam
param
=
null
;
MyPageParam
param
;
for
(
int
i
=
1
;
i
<=
totalPages
;
i
++)
{
for
(
int
i
=
1
;
i
<=
totalPages
;
i
++)
{
param
=
new
MyPageParam
();
param
=
new
MyPageParam
();
param
.
setPageNum
(
i
);
param
.
setPageNum
(
i
);
param
.
setPageSize
(
pageSize
);
param
.
setPageSize
(
pageSize
);
dealWithDatasetNodeData
(
datasetDataService
.
list
(
datasetId
,
param
),
datasetId
,
rules
);
List
<
DatasetData
>
dataList
=
datasetDataService
.
list
(
datasetId
,
param
);
dealWithDatasetNodeData
(
dataList
,
datasetId
,
rules
);
appendDataListToFile
(
datasetVersion
.
getFileUrl
()
,
dataList
);
}
}
}
catch
(
Exception
ex
)
{
}
catch
(
Exception
ex
)
{
log
.
error
(
"deal with task handler is error:"
,
ex
);
log
.
error
(
"deal with task handler is error:"
,
ex
);
...
@@ -225,42 +227,56 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -225,42 +227,56 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
}
}
/**
/**
*
写入到json中的配置文件中
*
第一个方法:清空文件
* @param
datasetId 数据集Id
* @param
filePath 文件地址
*/
*/
private
void
dealWithDatasetFileDataHandler
(
Long
datasetId
)
{
public
void
clearFileDatasetData
(
String
filePath
)
{
File
file
=
new
File
(
filePath
);
if
(
file
.
exists
())
{
FileWriter
fileWriter
=
null
;
try
{
try
{
DatasetVersion
datasetVersion
=
this
.
datasetVersionService
.
getById
(
datasetId
);
fileWriter
=
new
FileWriter
(
file
,
true
);
String
versionName
=
datasetVersion
.
getVersionName
();
fileWriter
.
write
(
""
);
//再存储数据集配置文件
}
catch
(
IOException
e
)
{
datasetVersionService
.
saveDatasetInfo
(
versionName
);
log
.
error
(
"clearFileDatasetData method is error:"
,
e
);
}
catch
(
Exception
ex
){
}
finally
{
log
.
error
(
"deal with dataset node data:"
,
ex
);
if
(
fileWriter
!=
null
)
{
try
{
fileWriter
.
close
();
}
catch
(
IOException
e
)
{
log
.
error
(
"file write close is errot"
,
e
);
}
}
}
}
}
}
}
/**
/**
* 保存导入文件。
* 第二个方法:将数据列表追加到文件
*
* @param filePath 文件地址
* @param importFile 导入的文件。
* @param dataList 数据集列表
* @return 保存的本地文件名。
*/
*/
private
String
saveDatasetFile
(
MultipartFile
importFile
,
String
versionName
,
Long
versionId
)
throws
IOException
{
public
void
appendDataListToFile
(
String
filePath
,
List
<
DatasetData
>
dataList
)
{
String
fullName
=
pythonConfig
.
getDatasetFileBaseDir
()
+
versionName
+
".json"
;
FileWriter
fileWriter
=
null
;
try
{
fileWriter
=
new
FileWriter
(
filePath
,
true
);
// 遍历你的数据列表,并将每一条数据写入到文件中
for
(
DatasetData
data
:
dataList
)
{
fileWriter
.
write
(
data
.
getData
());
fileWriter
.
write
(
"\n"
);
}
}
catch
(
IOException
e
)
{
log
.
error
(
"file write close is errot"
,
e
);
}
finally
{
// 如果fileWriter不为空,关闭它
if
(
fileWriter
!=
null
)
{
try
{
try
{
byte
[]
bytes
=
importFile
.
getBytes
();
fileWriter
.
close
();
Path
path
=
Paths
.
get
(
fullName
);
// 如果没有files文件夹,则创建
if
(!
Files
.
isWritable
(
path
))
{
Files
.
createDirectories
(
Paths
.
get
(
pythonConfig
.
getDatasetFileBaseDir
()));
}
// 文件写入指定路径、应该是追加到文件里面
Files
.
write
(
path
,
bytes
);
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
log
.
error
(
"Failed to write imported file ["
+
importFile
.
getOriginalFilename
()
+
" ]."
,
e
);
log
.
error
(
"file write close is errot"
,
e
);
throw
e
;
}
}
}
}
return
fullName
;
}
}
/**
/**
...
@@ -276,8 +292,8 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -276,8 +292,8 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
for
(
DatasetData
datasetData
:
dataList
)
{
for
(
DatasetData
datasetData
:
dataList
)
{
JsonNode
rootNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
JsonNode
rootNode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
rootNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
data
=
rootNode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
output
=
DataCleanerUtil
.
buildCleanAfterData
(
data
setData
.
getData
(),
rules
);
String
output
=
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
);
datasetData
.
setData
(
createNewDataNode
(
data
,
output
));
datasetData
.
setData
(
createNewDataNode
(
data
setData
.
getData
()
,
output
));
}
}
this
.
datasetDataService
.
updateBatch
(
dataList
,
datasetId
);
this
.
datasetDataService
.
updateBatch
(
dataList
,
datasetId
);
}
}
...
@@ -291,27 +307,27 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -291,27 +307,27 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
* @param json Json对象
* @param json Json对象
* @param output 输出对象
* @param output 输出对象
* @return 新的data数据
* @return 新的data数据
* @throws JsonProcessingException
* @throws JsonProcessingException
转换异常
*/
*/
private
String
createNewDataNode
(
String
json
,
String
output
)
throws
JsonProcessingException
{
private
String
createNewDataNode
(
String
json
,
String
output
)
throws
JsonProcessingException
{
String
modifiedJson
=
null
;
String
modifiedJson
=
null
;
ObjectMapper
mapper
=
new
ObjectMapper
();
ObjectMapper
mapper
=
new
ObjectMapper
();
JsonNode
rootNode
=
mapper
.
readTree
(
json
);
JsonNode
rootNode
=
mapper
.
readTree
(
json
);
// 获取data节点
// 检查output字段是否存在
JsonNode
dataNode
=
rootNode
.
get
(
DatasetConstant
.
DATA
);
if
(
rootNode
.
has
(
DatasetConstant
.
OUTPUT
))
{
if
(
dataNode
!=
null
&&
dataNode
.
isObject
())
{
// 转换
data
节点为ObjectNode以便修改
// 转换
root
节点为ObjectNode以便修改
ObjectNode
dataObjectNode
=
(
ObjectNode
)
data
Node
;
ObjectNode
objectNode
=
(
ObjectNode
)
root
Node
;
// 替换output字段的值
// 替换output字段的值
dataO
bjectNode
.
put
(
DatasetConstant
.
OUTPUT
,
output
);
o
bjectNode
.
put
(
DatasetConstant
.
OUTPUT
,
output
);
// 将修改后的JSON转换回字符串
// 将修改后的JSON转换回字符串
modifiedJson
=
mapper
.
writerWithDefaultPrettyPrinter
().
writeValueAsString
(
roo
tNode
);
modifiedJson
=
mapper
.
writerWithDefaultPrettyPrinter
().
writeValueAsString
(
objec
tNode
);
}
System
.
out
.
println
(
modifiedJson
);
}
return
modifiedJson
;
return
modifiedJson
;
}
}
...
@@ -326,14 +342,11 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -326,14 +342,11 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
private
List
<
DatasetDataClean
>
asyncCleanTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
private
List
<
DatasetDataClean
>
asyncCleanTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
List
<
DatasetDataClean
>
cleans
=
new
ArrayList
<>();
List
<
DatasetDataClean
>
cleans
=
new
ArrayList
<>();
try
{
try
{
List
<
String
>
rules
=
new
ArrayList
<>();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
cleanConfig
)
{
if
(
null
!=
cleanConfig
&&
null
==
cleanConfig
.
getCleanConfig
())
return
cleans
;
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getCleanConfig
());
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getCleanConfig
());
}
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
ObjectMapper
objectMapper
=
new
ObjectMapper
();
ObjectMapper
objectMapper
=
new
ObjectMapper
();
for
(
DatasetData
datasetData:
dataList
)
{
for
(
DatasetData
datasetData:
dataList
)
{
...
@@ -363,20 +376,17 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -363,20 +376,17 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
private
List
<
DatasetDataDesensitive
>
asyncDesensitiveTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
private
List
<
DatasetDataDesensitive
>
asyncDesensitiveTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
List
<
DatasetDataDesensitive
>
desensitives
=
new
ArrayList
<>();
List
<
DatasetDataDesensitive
>
desensitives
=
new
ArrayList
<>();
try
{
try
{
List
<
String
>
rules
=
new
ArrayList
<>();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
DatasetCleanConfig
cleanConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
cleanConfig
)
{
if
(
null
!=
cleanConfig
&&
null
==
cleanConfig
.
getDesensitiveConfig
())
return
desensitives
;
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getDesensitiveConfig
());
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
cleanConfig
.
getDesensitiveConfig
());
}
ObjectMapper
objectMapper
=
new
ObjectMapper
();
ObjectMapper
objectMapper
=
new
ObjectMapper
();
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData:
dataList
)
{
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataDesensitive
desensitive
=
new
DatasetDataDesensitive
();
DatasetDataDesensitive
desensitive
=
new
DatasetDataDesensitive
();
JsonNode
rootN
ode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
JsonNode
n
ode
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
rootN
ode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
data
=
n
ode
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
desensitive
.
setCleanBeforeData
(
data
);
desensitive
.
setCleanBeforeData
(
data
);
desensitive
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
));
desensitive
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
));
desensitive
.
setCleanId
(
cleanId
);
desensitive
.
setCleanId
(
cleanId
);
...
@@ -400,20 +410,17 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -400,20 +410,17 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
private
List
<
DatasetDataDeduplicate
>
asyncDeduplicateTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
private
List
<
DatasetDataDeduplicate
>
asyncDeduplicateTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
List
<
DatasetDataDeduplicate
>
deduplicates
=
new
ArrayList
<>();
List
<
DatasetDataDeduplicate
>
deduplicates
=
new
ArrayList
<>();
try
{
try
{
List
<
String
>
rules
=
new
ArrayList
<>();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
deduplicateConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
DatasetCleanConfig
deduplicateConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
deduplicateConfig
)
{
if
(
null
!=
deduplicateConfig
&&
null
==
deduplicateConfig
.
getDeduplicateConfig
())
return
deduplicates
;
rules
=
JsonNameExtractor
.
extractNames
(
deduplicateConfig
.
getDeduplicateConfig
());
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
deduplicateConfig
.
getDeduplicateConfig
());
}
ObjectMapper
objectMapper
=
new
ObjectMapper
();
ObjectMapper
objectMapper
=
new
ObjectMapper
();
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData:
dataList
)
{
for
(
DatasetData
datasetData:
dataList
)
{
DatasetDataDeduplicate
deduplicate
=
new
DatasetDataDeduplicate
();
DatasetDataDeduplicate
deduplicate
=
new
DatasetDataDeduplicate
();
JsonNode
root
Node
=
objectMapper
.
readTree
(
datasetData
.
getData
());
JsonNode
json
Node
=
objectMapper
.
readTree
(
datasetData
.
getData
());
String
data
=
root
Node
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
String
data
=
json
Node
.
get
(
DatasetConstant
.
OUTPUT
).
textValue
();
deduplicate
.
setCleanBeforeData
(
data
);
deduplicate
.
setCleanBeforeData
(
data
);
deduplicate
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
));
deduplicate
.
setCleanAfterData
(
DataCleanerUtil
.
buildCleanAfterData
(
data
,
rules
));
deduplicate
.
setCleanId
(
cleanId
);
deduplicate
.
setCleanId
(
cleanId
);
...
@@ -437,14 +444,11 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
...
@@ -437,14 +444,11 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
private
List
<
DatasetDataFilter
>
asyncFilterTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
private
List
<
DatasetDataFilter
>
asyncFilterTaskHandler
(
List
<
DatasetData
>
dataList
,
Long
cleanId
)
{
List
<
DatasetDataFilter
>
filters
=
new
ArrayList
<>();
List
<
DatasetDataFilter
>
filters
=
new
ArrayList
<>();
try
{
try
{
List
<
String
>
rules
=
new
ArrayList
<>();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
DatasetCleanConfig
filter
=
new
DatasetCleanConfig
();
filter
.
setCleanId
(
cleanId
);
filter
.
setCleanId
(
cleanId
);
DatasetCleanConfig
filterConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
DatasetCleanConfig
filterConfig
=
datasetCleanConfigService
.
getOne
(
filter
);
if
(
null
!=
filterConfig
)
{
if
(
null
!=
filterConfig
&&
null
==
filterConfig
.
getFilterConfig
())
return
filters
;
rules
=
JsonNameExtractor
.
extractNames
(
filterConfig
.
getFilterConfig
());
List
<
String
>
rules
=
JsonNameExtractor
.
extractNames
(
filterConfig
.
getFilterConfig
());
}
ObjectMapper
objectMapper
=
new
ObjectMapper
();
ObjectMapper
objectMapper
=
new
ObjectMapper
();
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData:
dataList
)
{
for
(
DatasetData
datasetData:
dataList
)
{
...
...
application-webadmin/src/main/java/com/yice/webadmin/app/service/impl/DatasetDataServiceImpl.java
View file @
0634e753
package
com
.
yice
.
webadmin
.
app
.
service
.
impl
;
package
com
.
yice
.
webadmin
.
app
.
service
.
impl
;
import
cn.hutool.core.collection.CollUtil
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.yice.common.core.object.MyPageParam
;
import
com.yice.common.core.object.MyPageParam
;
...
@@ -13,6 +14,7 @@ import org.springframework.beans.factory.annotation.Autowired;
...
@@ -13,6 +14,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.data.mongodb.core.query.Update
;
import
org.springframework.stereotype.Service
;
import
org.springframework.stereotype.Service
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
...
@@ -157,7 +159,22 @@ public class DatasetDataServiceImpl implements DatasetDataService {
...
@@ -157,7 +159,22 @@ public class DatasetDataServiceImpl implements DatasetDataService {
*/
*/
@Override
@Override
public
void
updateBatch
(
List
<
DatasetData
>
dataList
,
Long
versionId
)
{
public
void
updateBatch
(
List
<
DatasetData
>
dataList
,
Long
versionId
)
{
mongoTemplate
.
save
(
dataList
,
MongoConstant
.
COLLECT_NAME
+
versionId
);
if
(
CollUtil
.
isNotEmpty
(
dataList
))
{
for
(
DatasetData
datasetData
:
dataList
)
{
// 解析data字段的字符串为Document或Bson
Document
dataDocument
=
Document
.
parse
(
datasetData
.
getData
());
// 构建查询条件
Query
query
=
new
Query
(
Criteria
.
where
(
MongoConstant
.
ID
).
is
(
datasetData
.
getId
()));
// 构建更新操作
Update
update
=
new
Update
();
update
.
set
(
MongoConstant
.
DATA
,
dataDocument
);
// 执行更新操作
mongoTemplate
.
updateFirst
(
query
,
update
,
MongoConstant
.
COLLECT_NAME
+
versionId
);
}
}
}
}
/**
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment