In addition to Weibo, there is also WeChat
Please pay attention
WeChat public account
Shulou
2025-01-28 Update From: SLTechnology News&Howtos shulou NAV: SLTechnology News&Howtos > Development >
Share
Shulou(Shulou.com)06/03 Report--
This article explains how to implement two text-similarity algorithms — cosine similarity and SimHash — under .NET. The content is simple and clearly organized, and easy to learn and understand; follow along step by step to study how these algorithms work.
The specific analysis is as follows:
Cosine similarity
Principle: first, we segment two paragraphs of text and list all the words, then we calculate the word frequency of each word, and finally convert the words into vectors, so we only need to calculate the similarity between the two vectors.
We simply put it as follows
Text 1: I / Love / Beijing / Tiananmen / get the vector (pseudo vector) after word segmentation.
Text 2: we / all love / Beijing / Tiananmen / get the vector (pseudo vector) after word segmentation.
We can think of them as two vectors in space, both starting from the origin ([0, 0, ...]) and pointing in different directions. An angle is formed between the two vectors: if the angle is 0 degrees, the directions are the same and the vectors coincide; if the angle is 90 degrees, they are perpendicular and completely dissimilar; if the angle is 180 degrees, they point in opposite directions. Therefore, we can judge the degree of similarity by the size of the angle: the smaller the angle, the more similar the texts are.
C# core algorithm:
The copy code is as follows:
/// <summary>
/// Computes TF-IDF term weights for a set of documents and exposes the
/// cosine similarity between any two documents' term-weight vectors.
/// Word segmentation is delegated to an external <c>Tokeniser</c> class.
/// </summary>
public class TFIDFMeasure
{
    private string[] _docs;            // raw input documents
    private string[][] _ngramDoc;      // reserved for n-gram expansion (never populated)
    private int _numDocs = 0;
    private int _numTerms = 0;
    private ArrayList _terms;          // distinct terms across all documents, first-seen order
    private int[][] _termFreq;         // [term][doc] raw term frequency
    private float[][] _termWeight;     // [term][doc] tf-idf weight
    private int[] _maxTermFreq;        // per-document maximum term frequency (tf normalizer)
    private int[] _docFreq;            // number of documents containing each term

    /// <summary>Static vector helpers used for cosine similarity.</summary>
    public class TermVector
    {
        /// <summary>
        /// Cosine similarity of two equal-length vectors. Returns 0 when either
        /// vector has zero magnitude, avoiding a division by zero.
        /// </summary>
        /// <exception cref="ArgumentException">The vectors differ in length.</exception>
        public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new ArgumentException("Vectors must have the same length.");

            float denom = VectorLength(vector1) * VectorLength(vector2);
            return denom == 0F ? 0F : InnerProduct(vector1, vector2) / denom;
        }

        /// <summary>Dot product of two equal-length vectors.</summary>
        /// <exception cref="ArgumentException">The vectors differ in length.</exception>
        public static float InnerProduct(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new ArgumentException("Vectors must have the same length.");

            float result = 0F;
            for (int i = 0; i < vector1.Length; i++)
                result += vector1[i] * vector2[i];
            return result;
        }

        /// <summary>Euclidean (L2) norm of a vector.</summary>
        public static float VectorLength(float[] vector)
        {
            float sum = 0.0F;
            for (int i = 0; i < vector.Length; i++)
                sum += vector[i] * vector[i];
            return (float)Math.Sqrt(sum);
        }
    }

    private IDictionary _wordsIndex = new Hashtable();   // term -> index into _terms

    /// <summary>
    /// Builds the term-frequency and tf-idf weight matrices for <paramref name="documents"/>.
    /// </summary>
    public TFIDFMeasure(string[] documents)
    {
        _docs = documents;
        _numDocs = documents.Length;
        MyInit();
    }

    private void GeneratNgramText()
    {
        // Intentionally empty: n-gram generation was never implemented in the original.
    }

    /// <summary>Collects the distinct terms appearing in any document, in first-seen order.</summary>
    private ArrayList GenerateTerms(string[] docs)
    {
        ArrayList uniques = new ArrayList();
        _ngramDoc = new string[_numDocs][];
        for (int i = 0; i < docs.Length; i++)
        {
            Tokeniser tokenizer = new Tokeniser();
            string[] words = tokenizer.Partition(docs[i]);
            for (int j = 0; j < words.Length; j++)
                if (!uniques.Contains(words[j]))
                    uniques.Add(words[j]);
        }
        return uniques;
    }

    /// <summary>
    /// Stores <paramref name="newValue"/> under <paramref name="key"/> and
    /// returns the previous value (or null if none).
    /// </summary>
    private static object AddElement(IDictionary collection, object key, object newValue)
    {
        object element = collection[key];
        collection[key] = newValue;
        return element;
    }

    /// <summary>Index of <paramref name="term"/> in the term list, or -1 when unknown.</summary>
    private int GetTermIndex(string term)
    {
        object index = _wordsIndex[term];
        if (index == null)
            return -1;
        return (int)index;
    }

    private void MyInit()
    {
        _terms = GenerateTerms(_docs);
        _numTerms = _terms.Count;

        _maxTermFreq = new int[_numDocs];
        _docFreq = new int[_numTerms];
        _termFreq = new int[_numTerms][];
        _termWeight = new float[_numTerms][];

        for (int i = 0; i < _terms.Count; i++)
        {
            _termWeight[i] = new float[_numDocs];
            _termFreq[i] = new int[_numDocs];
            AddElement(_wordsIndex, _terms[i], i);
        }

        GenerateTermFrequency();
        GenerateTermWeight();
    }

    /// <summary>
    /// Natural logarithm. NOTE(review): the original comment claimed log2, but
    /// Math.Log(x) is base e; behavior kept, comment corrected.
    /// </summary>
    private float Log(float num)
    {
        return (float)Math.Log(num);
    }

    /// <summary>Fills _termFreq, _docFreq and _maxTermFreq from the raw documents.</summary>
    private void GenerateTermFrequency()
    {
        for (int i = 0; i < _numDocs; i++)
        {
            IDictionary freq = GetWordFrequency(_docs[i]);
            IDictionaryEnumerator enums = freq.GetEnumerator();
            _maxTermFreq[i] = int.MinValue;
            while (enums.MoveNext())
            {
                string word = (string)enums.Key;
                int wordFreq = (int)enums.Value;
                int termIndex = GetTermIndex(word);

                // BUG FIX: GenerateTerms indexes raw-case tokens while the
                // frequency map is built from lowercased text, so a lookup can
                // miss; the original then indexed _termFreq[-1] and threw.
                if (termIndex == -1)
                    continue;

                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;

                if (wordFreq > _maxTermFreq[i])
                    _maxTermFreq[i] = wordFreq;
            }
        }
    }

    /// <summary>Fills _termWeight with tf-idf for every (term, document) pair.</summary>
    private void GenerateTermWeight()
    {
        for (int i = 0; i < _numTerms; i++)
            for (int j = 0; j < _numDocs; j++)
                _termWeight[i][j] = ComputeTermWeight(i, j);
    }

    /// <summary>Term frequency normalized by the document's most frequent term.</summary>
    private float GetTermFrequency(int term, int doc)
    {
        int freq = _termFreq[term][doc];
        int maxfreq = _maxTermFreq[doc];
        return (float)freq / (float)maxfreq;
    }

    /// <summary>idf = ln(numDocs / docFreq); docFreq >= 1 for every indexed term.</summary>
    private float GetInverseDocumentFrequency(int term)
    {
        int df = _docFreq[term];
        return Log((float)_numDocs / (float)df);
    }

    private float ComputeTermWeight(int term, int doc)
    {
        float tf = GetTermFrequency(term, doc);
        float idf = GetInverseDocumentFrequency(term);
        return tf * idf;
    }

    /// <summary>The tf-idf weight vector (one entry per term) for a document.</summary>
    private float[] GetTermVector(int doc)
    {
        float[] w = new float[_numTerms];
        for (int i = 0; i < _numTerms; i++)
            w[i] = _termWeight[i][doc];
        return w;
    }

    /// <summary>Cosine similarity between two documents' tf-idf vectors, in [0, 1].</summary>
    public float GetSimilarity(int doc_i, int doc_j)
    {
        float[] vector1 = GetTermVector(doc_i);
        float[] vector2 = GetTermVector(doc_j);
        return TermVector.ComputeCosineSimilarity(vector1, vector2);
    }

    /// <summary>Word -> occurrence-count map for a single document (case-folded).</summary>
    private IDictionary GetWordFrequency(string input)
    {
        // NOTE(review): ToLower() is culture-sensitive; ToLowerInvariant() would be
        // safer for non-UI keys, but is kept as-is to preserve original behavior.
        string convertedInput = input.ToLower();
        Tokeniser tokenizer = new Tokeniser();
        string[] words = tokenizer.Partition(convertedInput);
        Array.Sort(words);   // sorted so CountWords can binary-search

        string[] distinctWords = GetDistinctWords(words);
        IDictionary result = new Hashtable();
        for (int i = 0; i < distinctWords.Length; i++)
            result[distinctWords[i]] = CountWords(distinctWords[i], words);
        return result;
    }

    /// <summary>Distinct words of <paramref name="input"/>, preserving first-seen order.</summary>
    private string[] GetDistinctWords(string[] input)
    {
        if (input == null)
            return new string[0];

        ArrayList list = new ArrayList();
        for (int i = 0; i < input.Length; i++)
            if (!list.Contains(input[i]))
                list.Add(input[i]);
        return Tokeniser.ArrayListToArray(list);
    }

    /// <summary>
    /// Counts occurrences of <paramref name="word"/> in the SORTED array
    /// <paramref name="words"/>: binary-search any occurrence, walk back to the
    /// first, then count forward. Returns 0 when the word is absent
    /// (Array.BinarySearch returns a negative value in that case).
    /// </summary>
    private int CountWords(string word, string[] words)
    {
        int itemIdx = Array.BinarySearch(words, word);
        if (itemIdx < 0)
            return 0;

        // BinarySearch may land anywhere inside a run of equal elements;
        // rewind to the first occurrence.
        while (itemIdx > 0 && words[itemIdx - 1].Equals(word))
            itemIdx--;

        int count = 0;
        while (itemIdx < words.Length && words[itemIdx].Equals(word))
        {
            count++;
            itemIdx++;
        }
        return count;
    }
}
Disadvantages:
Because an article contains many feature words, the resulting vector has a very high dimension, so the computation cost is too great to be suitable for processing large amounts of data.
SimHash principle:
The main idea of the algorithm is to reduce the dimension and map the high-dimensional feature vector into a f-bit fingerprint (fingerprint). By comparing the Hamming Distance of the f-bit fingerprint of the two articles to determine whether the article is repetitive or highly approximate. Because we can calculate the Hamming Distance in advance to save each article, and then directly through the Hamming Distance to calculate, so the speed is very fast for big data calculation.
Google is based on this algorithm to achieve duplicate checking of web files. Let's assume the following three paragraphs:
1,the cat sat on the mat
2,the cat sat on a mat
3,we all scream for ice cream
How to implement this hash algorithm? Taking the above three texts as an example, the whole process can be divided into the following six steps:
1. Select the number of bits of simhash, taking into account the storage cost and the size of the data set, such as 32 bits.
2. Initialize the members of simhash to 0.
3. The features of the original text are generally extracted by various ways of word segmentation. For example, for "the cat sat on the mat", the results are as follows: {"th", "he", "e", "c", "ca", "at", "t", "s", "sa", "o", "on", "n", "t", "m", "ma"}.
4. Use a traditional 32-bit hash function to calculate the hashcode of each feature, for example: "th".hash = -502157718, "he".hash = -369049682, and so on.
5. For each bit of each feature's hashcode: if the bit is 1, the corresponding position of the simhash accumulator is increased by 1; otherwise it is decreased by 1.
6. For the final 32-bit simhash: if the accumulated value at a position is greater than 0, the corresponding bit is set to 1, otherwise it is set to 0.
Thank you for reading. The above covers how to write the text similarity algorithms — cosine similarity and SimHash — under .NET. After studying this article you should have a deeper understanding of both algorithms; their concrete use still needs to be verified in practice. More related articles will be published here — welcome to follow!
Welcome to subscribe "Shulou Technology Information " to get latest news, interesting things and hot topics in the IT industry, and controls the hottest and latest Internet news, technology news and IT industry trends.
Views: 0
*The comments in the above article only represent the author's personal views and do not represent the views and positions of this website. If you have more insights, please feel free to contribute and share.
Continue with the installation of the previous hadoop.First, install zookooper1. Decompress zookoope
"Every 5-10 years, there's a rare product, a really special, very unusual product that's the most un
© 2024 shulou.com SLNews company. All rights reserved.