题解 | 找出相似度最高的文档
找出相似度最高的文档
https://www.nowcoder.com/practice/2494b4f0c5484f0eb5ee192b8e26c3c7
import java.util.*;
import java.io.*;
// 注意类名必须为 Main, 不要有任何 package xxx 信息
public class Main {
public static void main(String[] args) throws Exception{
BufferedReader bf = new BufferedReader(new InputStreamReader(System.in));
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(System.out));
int n = Integer.valueOf(bf.readLine());
String[] lines = new String[n];
for (int i = 0; i < n; i++){
lines[i] = bf.readLine();
// bw.write(line);
}
int k = Integer.valueOf(bf.readLine());
int p = Integer.valueOf(bf.readLine());
for (int i = 0; i < p; i++){
String[] line = bf.readLine().split(" ");
int t = Math.min(Integer.valueOf(line[0]), n - 1);
String[] info = Arrays.copyOfRange(line, 1, line.length);
int bestdoc = calc(t, k, lines, info);
bw.write(String.valueOf(bestdoc));
if (i != p - 1){
bw.write(" ");
}
}
bw.flush();
bw.close();
}
// 计算词向量
@SuppressWarnings("unchecked")
private static int calc(int t, int k, String[] lines, String[] info){
int start = t - k + 1;
Map<String, Integer>[] f = calcF(lines, start, t);
Map<String, Integer> tf = f[0];
Map<String, Integer> df = f[1];
int index = start;
Map<String, Double> embedding = new HashMap<>();
for (Map.Entry<String, Integer> entry : df.entrySet()){
String key = entry.getKey();
int value = entry.getValue();
double idfItem = Math.log((double)(k + 1) / (double)(value + 1)) + 1.0;
int tfItem = tf.get(key);
double tfIDF = (double) tfItem * idfItem;
embedding.put(key, tfIDF);
}
Map<String, Double> queryEmbeding = new HashMap<>();
double queryNorm = 0.0;
for (String word : info){
double tfIDF = embedding.getOrDefault(word, 0.0);
queryNorm += tfIDF * tfIDF;
queryEmbeding.put(word, tfIDF);
}
queryNorm = Math.sqrt(queryNorm);
Map<String, Double>[] documentEmbedings = new HashMap[k];
Arrays.setAll(documentEmbedings, vector -> new HashMap<>());
double maxSimilarity = -1.0;
int bestDoc = -1;
double[] weights = new double[k];
for (int i = 1; i <= k; i++){
weights[i - 1] = i / k;
}
for (Map<String, Double> documentEmbeding : documentEmbedings){
String line = lines[index];
double documentNorm = 0.0;
// double weight = (double)((index - start) + 1) / (double) (k);
for (String word : line.split(" ")){
double tfIDF = embedding.getOrDefault(word, 0.0);
documentNorm += tfIDF * tfIDF;
documentEmbeding.put(word, tfIDF);
}
documentNorm = Math.sqrt(documentNorm);
// 计算相似度
double dotProduct = 0.0;
for (Map.Entry<String, Double> queryWord : queryEmbeding.entrySet()){
if (documentEmbeding.containsKey(queryWord.getKey())){
dotProduct += documentEmbeding.get(queryWord.getKey()) * queryWord.getValue();
}
}
double similarity = 0.0;
if (queryNorm > 1e-12 && documentNorm > 1e-12) {
similarity = dotProduct / (queryNorm * documentNorm);
}
// System.out.println("similarity:" + similarity + "\n");
// similarity *= weights[index - start];
// 更新最佳文档
if (similarity >= 0.6) {
if (similarity > maxSimilarity) {
maxSimilarity = similarity;
bestDoc = index;
} else if (Math.abs(similarity - maxSimilarity) < 1e-12) {
// 如果相似度相同,选择窗口中更早的文档(即索引更小的)
double temp = similarity * weights[index - start] - maxSimilarity * weights[bestDoc - start];
if (temp < 1e-12){
if (index < bestDoc){
bestDoc = index;
}
} else {
bestDoc = index;
}
}
}
index++;
}
return bestDoc;
}
// 计算词频TF, DF
@SuppressWarnings("unchecked")
private static Map<String, Integer>[] calcF(String[] lines, int start, int t){
Map<String, Integer> freqTF = new HashMap<>();
Map<String, Integer> freqDF = new HashMap<>();
for (int i = start; i <= t; i++){
String line = lines[i];
String[] words = line.split(" ");
Set<String> hashset = new HashSet<>(Arrays.asList(words));
for (String word : words){
freqTF.put(word, freqTF.getOrDefault(word, 0) + 1);
freqDF.put(word, freqTF.getOrDefault(word, 0) + 1);
}
}
return new Map[]{freqTF, freqDF};
}
}
查看12道真题和解析
