Commit 74073a00 authored by pengxin

调整清洗事务管理。

parent 2cc62002
......@@ -121,7 +121,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
return datasetClean;
}
@Transactional(rollbackFor = Exception.class)
@Async("taskExecutor")
public Future<Void> executeCleanTaskAsync(List<DatasetData> dataList, Long cleanId, Long datasetId) {
asyncDealWithDatasetSaveBatch(dataList, cleanId);
......@@ -194,6 +193,12 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
if(null != clean){
doDatasetCleanHandler(clean.getDatasetId(), cleanId);
}
//重新清洗
DatasetClean filter = new DatasetClean();
filter.setCleanStatus(DatasetConstant.CLEAN_PROGRESS);
filter.setFinishTime(null);
filter.setCleanId(cleanId);
this.updateById(filter);
}
/**
......
......@@ -65,7 +65,7 @@ public class DataCleanerUtil {
data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", "");
break;
case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
data = data.replaceAll("[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+", "");
data = data.replaceAll("(?<![a-zA-Z])[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000](?![a-zA-Z])", "");
break;
case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", "");
......@@ -107,6 +107,12 @@ public class DataCleanerUtil {
return data;
}
/**
 * Ad-hoc manual check: passes a fixed sample sentence through
 * {@code filterSpecialCharacters} with a ratio of 0.2 and prints
 * whatever comes back to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Sample text containing a handful of the special characters ($, #, ^, *).
    final String sample = "我可以提供各种类型的帮助,包括回答问题、提供信息、解决$#问题、提供建议等。只要是我能力范围内^*的需求,我都会尽力帮助用户解决。";
    final String cleaned = filterSpecialCharacters(sample, 0.2);
    System.out.println(cleaned);
}
/**
* 计算阈值
* @param data 计算坏的数据
......@@ -162,22 +168,25 @@ public class DataCleanerUtil {
Map<String, Integer> wordCountMap = new HashMap<>();
for (Term term : termList) {
String word = term.word;
wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1);
// 忽略空格
if (StringUtils.isNotBlank(word)) {
wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1);
}
}
// 计算总词数
int totalWords = termList.size();
// 找到出现次数最多的词
String mostFrequentWord = wordCountMap.entrySet().stream()
.max(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
.orElse(null);
// 计算重复词的次数(即出现次数大于1的词的总次数)
int repeatedWordsCount = 0;
for (int count : wordCountMap.values()) {
if (count > 1) {
// 只计算重复的次数
repeatedWordsCount += count - 1;
}
// 如果没有词出现超过一次,直接返回原字符串
if (mostFrequentWord == null) {
return document;
}
// 计算词重复率
double repetitionRate = (double) repeatedWordsCount / totalWords;
// 计算最大词的重复率
double repetitionRate = (double) (wordCountMap.get(mostFrequentWord) - 1) / (double) termList.size();
return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : document;
}
......@@ -188,7 +197,6 @@ public class DataCleanerUtil {
* @param threshold 阀值
* @return 清洗后的数据
*/
public static String filterWordRepetition(String text, double threshold) {
// 将文本转换为字符数组
char[] characters = text.toCharArray();
......@@ -196,25 +204,26 @@ public class DataCleanerUtil {
// 统计每个字的出现次数
Map<Character, Integer> characterCountMap = new HashMap<>();
for (char c : characters) {
characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1);
// 忽略空白符
if (!Character.isWhitespace(c)) {
characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1);
}
}
// 计算总字数
int totalCharacters = characters.length;
// 找到出现次数最多的字
Character mostFrequentCharacter = characterCountMap.entrySet().stream()
.max(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
.orElse(null);
// 计算重复字的次数(即出现次数大于1的字的总次数)
int repeatedCharactersCount = 0;
for (int count : characterCountMap.values()) {
if (count > 1) {
// 只计算重复的次数
repeatedCharactersCount += count - 1;
}
// 如果没有字出现超过一次,直接返回原字符串
if (mostFrequentCharacter == null) {
return text;
}
// 计算重复率
double repetitionRate = (double) repeatedCharactersCount / totalCharacters;
// 计算最大字的重复率
double repetitionRate = (double) (characterCountMap.get(mostFrequentCharacter) - 1) / (double) characters.length;
//根据阀值判断进行数据返回
return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : text;
}
......@@ -227,8 +236,8 @@ public class DataCleanerUtil {
private static String filterSpecialCharacters(String data, double radio) {
StringBuffer result = new StringBuffer();
double specialCharacterRatio = calculateSpecialCharacterRatio(data);
if (specialCharacterRatio > radio) {
result.append(data.replaceAll("[#$%^&*()]", ""));
if (specialCharacterRatio <= radio) {
result.append(data.replaceAll("[#$^&*()]", ""));
} else {
result.append(data);
}
......@@ -258,7 +267,7 @@ public class DataCleanerUtil {
* @param radio 最大词条目数据
* @return 返回截取后的字符串
*/
public static String filterNumberWords(String data, int radio) {
private static String filterNumberWords(String data, int radio) {
data = data.toLowerCase();
if (data.length() <= radio) {
return data;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment