42d18a359135982c319658d02c826bfb

First of all I apologize for posting this in C# it should be in SQL (which I am not able to find).

What I am trying to do here is to calculate similarity between two piece of text via a stored procedure.

// Stored procedure: IsItUnique

CREATE PROCEDURE [dbo].[IsItUnique]
	@Subject NVARCHAR(255),
	@Message NVARCHAR(MAX),
	@Score FLOAT OUT,
	@Result BIT OUT
AS
BEGIN
	SET NOCOUNT ON;

	DECLARE @SS INT
	SET @SS = 80
	
	DECLARE records CURSOR FAST_FORWARD FOR SELECT Id FROM Posts WHERE dbo.SimilarityScore(dbo.CleanString(dbo.CleanString(dbo.CleanString(@Subject, 're:'), 'RE:'), 'Re:'), Subject) >= @SS

	SET @Result = 1
	OPEN records
		DECLARE @Id UNIQUEIDENTIFIER
		
		FETCH NEXT FROM records INTO @Id
		
		WHILE @@FETCH_STATUS = 0
		BEGIN
			
			IF @Result = 1
			BEGIN
				DECLARE @Text NVARCHAR(MAX)
				SELECT @Text = dbo.PostMessages.Text FROM dbo.PostMessages WHERE dbo.PostMessages.PostId = @Id
				
				SET @Score = dbo.SimilarityScore(@Message, @Text)
				
				IF @Score >= @SS
				BEGIN
					SET @Result = 0
				END
			END
			
			FETCH NEXT FROM records INTO @Id
		END

	CLOSE records
	DEALLOCATE records
END

// Function: SimilarityScore

CREATE FUNCTION [dbo].[SimilarityScore]
(
	@Prospect NVARCHAR(MAX),
	@Existing NVARCHAR(MAX)
)
RETURNS FLOAT
AS
BEGIN
	DECLARE @Score FLOAT

	DECLARE @Diff INT
	SET @Diff = dbo.DifferenceScore(@Prospect, @Existing)
	
	DECLARE @Tokens INT
	SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect, DEFAULT, DEFAULT)

	RETURN ((@Tokens - @Diff) * 100) / @Tokens

END

// Function: WordTokens

CREATE FUNCTION [dbo].[WordTokens]
  (
    @string NVARCHAR(MAX),
    @WordStartCharacters VARCHAR(255) = 'a-z',
    @WordCharacters VARCHAR(255) = '-a-z'''
  )
RETURNS @Results TABLE
  (
    SeqNo INT IDENTITY(1, 1),
    Item VARCHAR(255),
    TokenType INT
  )
AS /*
This table function produces a table which divides up the words and 
the spaces between the words in some text and produces a table of the
two types of token in the sequence in which they were found
*/
   BEGIN
    DECLARE @Pos INT,    --index of current search
      @WhereWeAre INT,--index into string so far
      @ii INT,    --the number of words found so far
      @next INT,  --where the next search starts 
      @size INT   --the total size of the text

    SELECT  @ii = 0, @WhereWeAre = 1, @size = DATALENGTH(@string)


    WHILE @Size >= @WhereWeAre
      BEGIN
        SELECT  @pos = PATINDEX('%[' + @wordStartCharacters + ']%',
                                SUBSTRING(@string, @whereWeAre, 4000))
        IF @pos > 0 
          BEGIN
            IF @pos > 1 
              INSERT  INTO @Results
                      ( item, tokentype )
                      SELECT  SUBSTRING(@String, @whereWeAre, @pos - 1), 2
            SELECT  @next = @WhereWeAre + @pos, @ii = @ii + 1
            SELECT  @pos = PATINDEX('%[^' + @wordCharacters + ']%',
                                    SUBSTRING(@string, @next, 4000) + ' ')
            INSERT  INTO @Results
                    ( item, tokentype )
                    SELECT  SUBSTRING(@String, @next - 1, @pos), 1
            SELECT  @WhereWeAre = @next + @pos - 1
          END
        ELSE 
          BEGIN
            IF LEN(REPLACE(
                    SUBSTRING(@String, @whereWeAre, 4000), ' ', '!'
            )) > 0 
              INSERT  INTO @Results
                      ( item, tokentype )
                      SELECT  SUBSTRING(@String, @whereWeAre, 4000), 2
            SELECT  @whereWeAre = @WhereWeAre + 4000
          END
      END
    RETURN
   END 

// Function: DifferenceScore

CREATE FUNCTION [dbo].[DifferenceScore]
  (
    @Sample NVARCHAR(MAX),
    @comparison NVARCHAR(MAX)
  )
RETURNS INT
AS BEGIN
    DECLARE @results TABLE
      (
        token_ID INT IDENTITY(1, 1),
        sequenceNumber INT,
        Sample_ID INT,
        Item VARCHAR(255),
        TokenType INT
      )
/*
This function returns the number of differences it found between two pieces
of text
*/
    INSERT  INTO @results
            ( SequenceNumber, Sample_ID, Item, Tokentype )
            SELECT  seqno, 1, item, tokentype
            FROM    dbo.WordTokens(@sample, DEFAULT, DEFAULT)

    INSERT  INTO @results
            ( SequenceNumber, Sample_ID, Item, Tokentype )
            SELECT  seqno, 2, item, tokentype
            FROM    dbo.WordTokens(@comparison, DEFAULT, DEFAULT)
    DECLARE @closestMatch TABLE
      (
        sequenceNumber INT,
        skew INT
      )
    INSERT  INTO @closestMatch
            ( sequencenumber, skew )
            SELECT  COALESCE(a.sequencenumber, b.sequencenumber),
                    COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000) 
                        - COALESCE(a.sequencenumber, 1000))),
                             -1)
            FROM    ( SELECT  *
                      FROM    @results
                      WHERE   sample_ID = 1 AND tokentype = 1
                    ) a FULL OUTER JOIN ( SELECT  *
                                          FROM    @results
                                          WHERE   sample_ID = 2 
                                              AND tokentype = 1
                                        ) b ON a.item = b.item
            GROUP BY COALESCE(a.sequencenumber, b.sequencenumber)
            ORDER BY COALESCE(a.sequencenumber, b.sequencenumber)



    RETURN ( SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0
                             ELSE 1
                        END)
             FROM   @closestmatch a INNER JOIN @closestMatch b 
                  ON b.sequenceNumber = a.sequenceNumber + 2
           )
   END

// CLR Function: CleanString

public partial class UserDefinedFunctions {
	[SqlFunction]
	public static SqlString CleanString(SqlString Value, SqlString Token) {
		return Value.ToString().Replace(Token.ToString(), "").Trim();
	}
};

Refactorings

No refactoring yet !

39e6d6bdacb7a70e7793917f10f52bbf

school grants

April 5, 2010, April 05, 2010 12:25, permalink

No rating. Login to rate!

What a great resource!

7e079deced874546394fd894a54bfbd7

John

November 22, 2010, November 22, 2010 20:47, permalink

No rating. Login to rate!

I took your code which produces similarity scores for text (because it looks at the similarity of just the words themselves) and altered it to produce similarity score for the letters within words themselves. I'm sure there are better and more complex algorithms for this but it's a start.

--Function: SimilarityScore

CREATE FUNCTION [dbo].[SimilarityScore]
(
	@Prospect VARCHAR(MAX),
	@Existing VARCHAR(MAX)
)
RETURNS FLOAT
AS
BEGIN
	DECLARE @Score FLOAT

	DECLARE @Diff INT
	SET @Diff = dbo.DifferenceScore(@Prospect, @Existing)
	
	DECLARE @Tokens INT
	SELECT @Tokens = COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Prospect)
	SELECT @Tokens = @Tokens+COUNT(dbo.WordTokens.Item) FROM dbo.WordTokens(@Existing)

	RETURN ((CAST(@Tokens AS FLOAT) - CAST(@Diff AS FLOAT)) * 100) / CAST(@Tokens AS FLOAT)

END

--Function: WordTokens

CREATE FUNCTION [dbo].[WordTokens]
  (
    @string VARCHAR(MAX)
  )
RETURNS @Results TABLE
  (
    SeqNo INT IDENTITY(1, 1),
    Item VARCHAR(255)
  )
AS /*
This table function produces a table which divides up the words and 
the spaces between the words in some text and produces a table of the
two types of token in the sequence in which they were found
*/
   BEGIN
    DECLARE @i INT,    --the number of words found so far
      @size INT   --the total size of the text

    SELECT  @i = 1, @size = DATALENGTH(@string)

    WHILE @Size >= @i
      BEGIN
            INSERT  INTO @Results
            (item)
            SELECT SUBSTRING(@String,@i,1)
 			SET @i=@i+1
      END
    RETURN
   END 

--Function: DifferenceScore

CREATE FUNCTION [dbo].[DifferenceScore]
  (
    @Sample VARCHAR(MAX),
    @comparison VARCHAR(MAX)
  )
RETURNS INT
AS BEGIN
    DECLARE @results TABLE
      (
        token_ID INT IDENTITY(1, 1),
        sequenceNumber INT,
        Sample_ID INT,
        Item VARCHAR(255)
      )
/*
This function returns the number of differences it found between two pieces
of text
*/
    INSERT  INTO @results
            (SequenceNumber, Sample_ID, Item)
            SELECT  seqno, 1, item
            FROM    dbo.WordTokens(@sample)

    INSERT  INTO @results
            (SequenceNumber, Sample_ID, Item)
            SELECT  seqno, 2, item
            FROM    dbo.WordTokens(@comparison)
    DECLARE @closestMatch TABLE
      (
        sequenceNumber INT,
        skew INT
      )
    INSERT  INTO @closestMatch
            (sequencenumber, skew)
            SELECT  COALESCE(a.sequencenumber, b.sequencenumber),
                    COALESCE(MIN(ABS(COALESCE(b.sequenceNumber, 1000) 
                        - COALESCE(a.sequencenumber, 1000))),
                             -1)
            FROM    ( SELECT  *
                      FROM    @results
                      WHERE   sample_ID = 1
                    ) a FULL OUTER JOIN ( SELECT  *
                                          FROM    @results
                                          WHERE   sample_ID = 2
                                        ) b ON a.item = b.item
            GROUP BY COALESCE(a.sequencenumber, b.sequencenumber)
            ORDER BY COALESCE(a.sequencenumber, b.sequencenumber)

    RETURN (SELECT SUM(CASE WHEN a.skew - b.skew = 0 THEN 0
                             ELSE 1
                        END)
             FROM   @closestmatch a INNER JOIN @closestMatch b 
                  ON b.sequenceNumber = a.sequenceNumber + 2
           )
   END

Your refactoring





Format Copy from initial code

or Cancel