// Stored procedure: IsItUnique
CREATE PROCEDURE [dbo].[IsItUnique]
@Subject NVARCHAR(255),
@Message NVARCHAR(MAX),
@Score FLOAT OUT,
@Result BIT OUT
AS
BEGIN
SET NOCOUNT ON;
DECLARE @SS INT
SET @SS = 80
DECLARE records CURSOR FAST_FORWARD FOR SELECT Id FROM Posts WHERE dbo.SimilarityScore(dbo.CleanString(dbo.CleanString(dbo.CleanString(@Subject, 're:'), 'RE:'), 'Re:'), Subject) >= @SS
SET @Result = 1
OPEN records
DECLARE @Id UNIQUEIDENTIFIER
FETCH NEXT FROM records INTO @Id
WHILE @@FETCH_STATUS = 0
BEGIN
IF @Result = 1
BEGIN
DECLARE @Text NVARCHAR(MAX)
SELECT @Text = dbo.PostMessages.Text FROM dbo.PostMessages WHERE dbo.PostMessages.PostId = @Id
SET @Score = dbo.SimilarityScore(@Message, @Text)
IF @Score >= @SS
BEGIN
SET @Result = 0
END
END
FETCH NEXT FROM records INTO @Id
END
CLOSE records
DEALLOCATE records
END
// Function: SimilarityScore
CREATE FUNCTION [dbo].[SimilarityScore]
(
@Prospect NVARCHAR(MAX),
@Existing NVARCHAR(MAX)
)
RETURNS FLOAT
AS
BEGIN
DECLARE @Score FLOAT
DECLARE @Diff INT
SET @Diff = dbo.DifferenceScore(@Prospect, @Existing)
DECLARE @Tokens INT
SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect, DEFAULT, DEFAULT)
RETURN ((@Tokens - @Diff) * 100) / @Tokens
END
// Function: WordTokens
CREATE FUNCTION [dbo].[WordTokens]
(
@string NVARCHAR(MAX),
@WordStartCharacters VARCHAR(255) = 'a-z',
@WordCharacters VARCHAR(255) = '-a-z'''
)
RETURNS @Results TABLE
(
SeqNo INT IDENTITY(1, 1),
Item VARCHAR(255),
TokenType INT
)
AS /*
This table function produces a table which divides up the words and
the spaces between the words in some text and produces a table of the
two types of token in the sequence in which they were found
*/
BEGIN
DECLARE @Pos INT, --index of current search
@WhereWeAre INT,--index into string so far
@ii INT, --the number of words found so far
@next INT, --where the next search starts
@size INT --the total size of the text
SELECT @ii = 0, @WhereWeAre = 1, @size = DATALENGTH(@string)
WHILE @Size >= @WhereWeAre
BEGIN
SELECT @pos = PATINDEX('%[' + @wordStartCharacters + ']%',
SUBSTRING(@string, @whereWeAre, 4000))
IF @pos > 0
BEGIN
IF @pos > 1
INSERT INTO @Results
( item, tokentype )
SELECT SUBSTRING(@String, @whereWeAre, @pos - 1), 2
SELECT @next = @WhereWeAre + @pos, @ii = @ii + 1
SELECT @pos = PATINDEX('%[^' + @wordCharacters + ']%',
SUBSTRING(@string, @next, 4000) + ' ')
INSERT INTO @Results
( item, tokentype )
SELECT SUBSTRING(@String, @next - 1, @pos), 1
SELECT @WhereWeAre = @next + @pos - 1
END
ELSE
BEGIN
IF LEN(REPLACE(
SUBSTRING(@String, @whereWeAre, 4000), ' ', '!'
)) > 0
INSERT INTO @Results
( item, tokentype )
SELECT SUBSTRING(@String, @whereWeAre, 4000), 2
SELECT @whereWeAre = @WhereWeAre + 4000
END
END
RETURN
END
// Function: DifferenceScore
CREATE FUNCTION [dbo].[DifferenceScore]
(
@Sample NVARCHAR(MAX),
@comparison NVARCHAR(MAX)
)
RETURNS INT
AS BEGIN
DECLARE @results TABLE
(
token_ID INT IDENTITY(1, 1),
sequenceNumber INT,
Sample_ID INT,
Item VARCHAR(255),
TokenType INT
)
/*
This function returns the number of differences it found between two pieces
of text
*/
INSERT INTO @results
( SequenceNumber, Sample_ID, Item, Tokentype )
SELECT seqno, 1, item, tokentype
FROM dbo.WordTokens(@sample, DEFAULT, DEFAULT)
INSERT INTO @results
( SequenceNumber, Sample_ID, Item, Tokentype )
SELECT seqno, 2, item, tokentype
FROM dbo.WordTokens(@comparison, DEFAULT, DEFAULT)
DECLARE @closestMatch TABLE
(
sequenceNumber INT,
skew INT
)
INSERT INTO @closestMatch
( sequencenumber, skew )
SELECT COALESCE(a.sequencenumber, b.sequencenumber),
COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000)
- COALESCE(a.sequencenumber, 1000))),
-1)
FROM ( SELECT *
FROM @results
WHERE sample_ID = 1 AND tokentype = 1
) a FULL OUTER JOIN ( SELECT *
FROM @results
WHERE sample_ID = 2
AND tokentype = 1
) b ON a.item = b.item
GROUP BY COALESCE(a.sequencenumber, b.sequencenumber)
ORDER BY COALESCE(a.sequencenumber, b.sequencenumber)
RETURN ( SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0
ELSE 1
END)
FROM @closestmatch a INNER JOIN @closestMatch b
ON b.sequenceNumber = a.sequenceNumber + 2
)
END
// CLR Function: CleanString
public partial class UserDefinedFunctions {
[SqlFunction]
public static SqlString CleanString(SqlString Value, SqlString Token) {
return Value.ToString().Replace(Token.ToString(), "").Trim();
}
};
Refactorings
No refactoring yet !
John
November 22, 2010, November 22, 2010 20:47, permalink
I took your code which produces similarity scores for text (because it looks at the similarity of just the words themselves) and altered it to produce similarity score for the letters within words themselves. I'm sure there are better and more complex algorithms for this but it's a start.
--Function: SimilarityScore
CREATE FUNCTION [dbo].[SimilarityScore]
(
@Prospect VARCHAR(MAX),
@Existing VARCHAR(MAX)
)
RETURNS FLOAT
AS
BEGIN
DECLARE @Score FLOAT
DECLARE @Diff INT
SET @Diff = dbo.DifferenceScore(@Prospect, @Existing)
DECLARE @Tokens INT
SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect)
SELECT @Tokens = @Tokens+COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Existing)
RETURN ((CAST(@Tokens AS FLOAT) - CAST(@Diff AS FLOAT)) * 100) / CAST(@Tokens AS FLOAT)
END
--Function: WordTokens
CREATE FUNCTION [dbo].[WordTokens]
(
@string VARCHAR(MAX)
)
RETURNS @Results TABLE
(
SeqNo INT IDENTITY(1, 1),
Item VARCHAR(255)
)
AS /*
This table function produces a table which divides up the words and
the spaces between the words in some text and produces a table of the
two types of token in the sequence in which they were found
*/
BEGIN
DECLARE @i INT, --the number of words found so far
@size INT --the total size of the text
SELECT @i = 1, @size = DATALENGTH(@string)
WHILE @Size >= @i
BEGIN
INSERT INTO @Results
(item)
SELECT SUBSTRING(@String,@i,1)
SET @i=@i+1
END
RETURN
END
--Function: DifferenceScore
CREATE FUNCTION [dbo].[DifferenceScore]
(
@Sample VARCHAR(MAX),
@comparison VARCHAR(MAX)
)
RETURNS INT
AS BEGIN
DECLARE @results TABLE
(
token_ID INT IDENTITY(1, 1),
sequenceNumber INT,
Sample_ID INT,
Item VARCHAR(255)
)
/*
This function returns the number of differences it found between two pieces
of text
*/
INSERT INTO @results
(SequenceNumber, Sample_ID, Item)
SELECT seqno, 1, item
FROM dbo.WordTokens(@sample)
INSERT INTO @results
(SequenceNumber, Sample_ID, Item)
SELECT seqno, 2, item
FROM dbo.WordTokens(@comparison)
DECLARE @closestMatch TABLE
(
sequenceNumber INT,
skew INT
)
INSERT INTO @closestMatch
(sequencenumber, skew)
SELECT COALESCE(a.sequencenumber, b.sequencenumber),
COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000)
- COALESCE(a.sequencenumber, 1000))),
-1)
FROM ( SELECT *
FROM @results
WHERE sample_ID = 1
) a FULL OUTER JOIN ( SELECT *
FROM @results
WHERE sample_ID = 2
) b ON a.item = b.item
GROUP BY COALESCE(a.sequencenumber, b.sequencenumber)
ORDER BY COALESCE(a.sequencenumber, b.sequencenumber)
RETURN (SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0
ELSE 1
END)
FROM @closestmatch a INNER JOIN @closestMatch b
ON b.sequenceNumber = a.sequenceNumber + 2
)
END
First of all I apologize for posting this in C# it should be in SQL (which I am not able to find).
What I am trying to do here is to calculate similarity between two piece of text via a stored procedure.