Commit 74073a00 authored by pengxin

调整清洗事务管理。

parent 2cc62002
...@@ -121,7 +121,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp ...@@ -121,7 +121,6 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
return datasetClean; return datasetClean;
} }
@Transactional(rollbackFor = Exception.class)
@Async("taskExecutor") @Async("taskExecutor")
public Future<Void> executeCleanTaskAsync(List<DatasetData> dataList, Long cleanId, Long datasetId) { public Future<Void> executeCleanTaskAsync(List<DatasetData> dataList, Long cleanId, Long datasetId) {
asyncDealWithDatasetSaveBatch(dataList, cleanId); asyncDealWithDatasetSaveBatch(dataList, cleanId);
...@@ -194,6 +193,12 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp ...@@ -194,6 +193,12 @@ public class DatasetCleanServiceImpl extends BaseService<DatasetClean, Long> imp
if(null != clean){ if(null != clean){
doDatasetCleanHandler(clean.getDatasetId(), cleanId); doDatasetCleanHandler(clean.getDatasetId(), cleanId);
} }
//重新清洗
DatasetClean filter = new DatasetClean();
filter.setCleanStatus(DatasetConstant.CLEAN_PROGRESS);
filter.setFinishTime(null);
filter.setCleanId(cleanId);
this.updateById(filter);
} }
/** /**
......
...@@ -65,7 +65,7 @@ public class DataCleanerUtil { ...@@ -65,7 +65,7 @@ public class DataCleanerUtil {
data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", ""); data = data.replaceAll("[\\x00-\\x1F\\x7F-\\x9F]", "");
break; break;
case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE: case DatasetCleanConstant.REPLACE_UNIFORM_WHITESPACE:
data = data.replaceAll("[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000]+", ""); data = data.replaceAll("(?<![a-zA-Z])[\\u0020\\u00A0\\u2000-\\u200A\\u2028\\u2029\\u3000](?![a-zA-Z])", "");
break; break;
case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS: case DatasetCleanConstant.REMOVE_NON_MEANING_CHARACTERS:
data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", ""); data = data.replaceAll("[\\p{Cntrl}\\p{Cn}]", "");
...@@ -107,6 +107,12 @@ public class DataCleanerUtil { ...@@ -107,6 +107,12 @@ public class DataCleanerUtil {
return data; return data;
} }
/**
 * Ad-hoc manual check: passes a fixed sample sentence containing a few
 * symbol characters ($, #, ^, *) through {@code filterSpecialCharacters}
 * with a ratio argument of 0.2 and prints the returned string to stdout.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sample = "我可以提供各种类型的帮助,包括回答问题、提供信息、解决$#问题、提供建议等。只要是我能力范围内^*的需求,我都会尽力帮助用户解决。";
    final String filtered = filterSpecialCharacters(sample, 0.2);
    System.out.println(filtered);
}
/** /**
* 计算阀值 * 计算阀值
* @param data 计算坏的数据 * @param data 计算坏的数据
...@@ -162,22 +168,25 @@ public class DataCleanerUtil { ...@@ -162,22 +168,25 @@ public class DataCleanerUtil {
Map<String, Integer> wordCountMap = new HashMap<>(); Map<String, Integer> wordCountMap = new HashMap<>();
for (Term term : termList) { for (Term term : termList) {
String word = term.word; String word = term.word;
wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1); // 忽略空格
if (StringUtils.isNotBlank(word)) {
wordCountMap.put(word, wordCountMap.getOrDefault(word, 0) + 1);
}
} }
// 计算总词数 // 找到出现次数最多的词
int totalWords = termList.size(); String mostFrequentWord = wordCountMap.entrySet().stream()
.max(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
.orElse(null);
// 计算重复词的次数(即出现次数大于1的词的总次数) // 如果没有词出现超过一次,直接返回原字符串
int repeatedWordsCount = 0; if (mostFrequentWord == null) {
for (int count : wordCountMap.values()) { return document;
if (count > 1) {
// 只计算重复的次数
repeatedWordsCount += count - 1;
}
} }
// 计算词重复率
double repetitionRate = (double) repeatedWordsCount / totalWords; // 计算最大词的重复率
double repetitionRate = (double) (wordCountMap.get(mostFrequentWord) - 1) / (double) termList.size();
return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : document; return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : document;
} }
...@@ -188,7 +197,6 @@ public class DataCleanerUtil { ...@@ -188,7 +197,6 @@ public class DataCleanerUtil {
* @param threshold 阀值 * @param threshold 阀值
* @return 清洗后的数据 * @return 清洗后的数据
*/ */
public static String filterWordRepetition(String text, double threshold) { public static String filterWordRepetition(String text, double threshold) {
// 将文本转换为字符数组 // 将文本转换为字符数组
char[] characters = text.toCharArray(); char[] characters = text.toCharArray();
...@@ -196,25 +204,26 @@ public class DataCleanerUtil { ...@@ -196,25 +204,26 @@ public class DataCleanerUtil {
// 统计每个字的出现次数 // 统计每个字的出现次数
Map<Character, Integer> characterCountMap = new HashMap<>(); Map<Character, Integer> characterCountMap = new HashMap<>();
for (char c : characters) { for (char c : characters) {
characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1); // 忽略空白符
if (!Character.isWhitespace(c)) {
characterCountMap.put(c, characterCountMap.getOrDefault(c, 0) + 1);
}
} }
// 计算总字数 // 找到出现次数最多的字
int totalCharacters = characters.length; Character mostFrequentCharacter = characterCountMap.entrySet().stream()
.max(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
.orElse(null);
// 计算重复字的次数(即出现次数大于1的字的总次数) // 如果没有字出现超过一次,直接返回原字符串
int repeatedCharactersCount = 0; if (mostFrequentCharacter == null) {
for (int count : characterCountMap.values()) { return text;
if (count > 1) {
// 只计算重复的次数
repeatedCharactersCount += count - 1;
}
} }
// 计算重复率 // 计算最大字的重复率
double repetitionRate = (double) repeatedCharactersCount / totalCharacters; double repetitionRate = (double) (characterCountMap.get(mostFrequentCharacter) - 1) / (double) characters.length;
//根据阀值判断进行数据返回
return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : text; return repetitionRate > threshold ? DatasetConstant.EMPTY_STR : text;
} }
...@@ -227,8 +236,8 @@ public class DataCleanerUtil { ...@@ -227,8 +236,8 @@ public class DataCleanerUtil {
private static String filterSpecialCharacters(String data, double radio) { private static String filterSpecialCharacters(String data, double radio) {
StringBuffer result = new StringBuffer(); StringBuffer result = new StringBuffer();
double specialCharacterRatio = calculateSpecialCharacterRatio(data); double specialCharacterRatio = calculateSpecialCharacterRatio(data);
if (specialCharacterRatio > radio) { if (specialCharacterRatio <= radio) {
result.append(data.replaceAll("[#$%^&*()]", "")); result.append(data.replaceAll("[#$^&*()]", ""));
} else { } else {
result.append(data); result.append(data);
} }
...@@ -258,7 +267,7 @@ public class DataCleanerUtil { ...@@ -258,7 +267,7 @@ public class DataCleanerUtil {
* @param radio 最大词条目数据 * @param radio 最大词条目数据
* @return 返回截取后的字符串 * @return 返回截取后的字符串
*/ */
public static String filterNumberWords(String data, int radio) { private static String filterNumberWords(String data, int radio) {
data = data.toLowerCase(); data = data.toLowerCase();
if (data.length() <= radio) { if (data.length() <= radio) {
return data; return data;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment