Post processing edit distances for more precise similarity scores (#1692)

epipav · web-flow · commit 5e8245037f2d · 2023-10-16T12:54:29.000+02:00
diff --git a/backend/package-lock.json b/backend/package-lock.json
diff --git a/backend/package.json b/backend/package.json
@@ -54,10 +54,10 @@
     "@crowd/common": "file:../services/libs/common",
     "@crowd/integrations": "file:../services/libs/integrations",
     "@crowd/logging": "file:../services/libs/logging",
-    "@crowd/tracing": "file:../services/libs/tracing",
     "@crowd/opensearch": "file:../services/libs/opensearch",
     "@crowd/redis": "file:../services/libs/redis",
     "@crowd/sqs": "file:../services/libs/sqs",
+    "@crowd/tracing": "file:../services/libs/tracing",
     "@crowd/types": "file:../services/libs/types",
     "@cubejs-client/core": "^0.30.4",
     "@google-cloud/storage": "5.3.0",
@@ -97,6 +97,7 @@
     "erlpack": "^0.1.4",
     "express": "4.17.1",
     "express-rate-limit": "6.5.1",
+    "fast-levenshtein": "^3.0.0",
     "formidable-serverless": "1.1.1",
     "he": "^1.2.0",
     "helmet": "4.1.1",
diff --git a/backend/src/database/repositories/organizationRepository.ts b/backend/src/database/repositories/organizationRepository.ts
@@ -1,4 +1,5 @@
 import lodash, { chunk } from 'lodash'
+import { get as getLevenshteinDistance } from 'fast-levenshtein'
 import validator from 'validator'
 import { FieldTranslatorFactory, OpensearchQueryParser } from '@crowd/opensearch'
 import { PageData } from '@crowd/common'
@@ -27,6 +28,11 @@ import SegmentRepository from './segmentRepository'
 
 const { Op } = Sequelize
 
+interface IOrganizationIdentityOpensearch {
+  string_platform: string
+  string_name: string
+}
+
 interface IOrganizationPartialAggregatesOpensearch {
   _source: {
     uuid_organizationId: string
@@ -38,10 +44,12 @@ interface IOrganizationPartialAggregatesOpensearch {
   }
 }
 
-interface IOrganizationIdOpensearch {
+interface ISimilarOrganization {
   _score: number
   _source: {
     uuid_organizationId: string
+    nested_identities: IOrganizationIdentityOpensearch[]
+    nested_weakIdentities: IOrganizationIdentityOpensearch[]
   }
 }
 
@@ -54,8 +62,6 @@ interface IOrganizationNoMerge {
   noMergeId: string
 }
 
-type MinMaxScores = { maxScore: number; minScore: number }
-
 class OrganizationRepository {
   static async filterByPayingTenant(
     tenantId: string,
@@ -1162,28 +1168,48 @@ class OrganizationRepository {
       return 10
     }
 
-    const normalizeScore = (max: number, min: number, score: number): number => {
-      if (score > 100) {
-        return 1
-      }
+    const calculateSimilarity = (
+      primaryOrganization: IOrganizationPartialAggregatesOpensearch,
+      similarOrganization: ISimilarOrganization,
+    ): number => {
+      let smallestEditDistance: number = null
 
-      if (max === min) {
-        return (40 + Math.floor(Math.random() * 26) - 10) / 100
-      }
-
-      const normalizedScore = (score - min) / (max - min)
+      let similarPrimaryIdentity: IOrganizationIdentityOpensearch = null
 
-      // randomize the cases where score === max and score === min
-      if (normalizedScore === 1) {
-        return Math.floor(Math.random() * (76 - 50) + 50) / 100
+      // find the smallest edit distance between both identity arrays
+      for (const primaryIdentity of primaryOrganization._source.nested_identities) {
+        // the similar organization has one of the primary organization's strong identities as a weak identity (same name and platform), return a similarity of 0.95
+        if (
+          similarOrganization._source.nested_weakIdentities.length > 0 &&
+          similarOrganization._source.nested_weakIdentities.some(
+            (weakIdentity) =>
+              weakIdentity.string_name === primaryIdentity.string_name &&
+              weakIdentity.string_platform === primaryIdentity.string_platform,
+          )
+        ) {
+          return 0.95
+        }
+        for (const secondaryIdentity of similarOrganization._source.nested_identities) {
+          const currentLevenstheinDistance = getLevenshteinDistance(
+            primaryIdentity.string_name,
+            secondaryIdentity.string_name,
+          )
+          if (smallestEditDistance === null || smallestEditDistance > currentLevenstheinDistance) {
+            smallestEditDistance = currentLevenstheinDistance
+            similarPrimaryIdentity = primaryIdentity
+          }
+        }
       }
 
-      // normalization is resolved to 0, randomize it
-      if (normalizedScore === 0) {
-        return Math.floor(Math.random() * (41 - 20) + 20) / 100
+      // calculate the similarity as a ratio of the matched identity name's length, truncated to two decimals
+      const identityLength = similarPrimaryIdentity.string_name.length
+
+      if (identityLength < smallestEditDistance) {
+        // if the Levenshtein distance is greater than the length of the identity name, it might be a prefix match, return a random similarity between 0.20 and 0.40
+        return (Math.floor(Math.random() * 21) + 20) / 100
       }
 
-      return normalizedScore
+      return Math.floor(((identityLength - smallestEditDistance) / identityLength) * 100) / 100
     }
 
     const tenant = SequelizeRepository.getCurrentTenant(options)
@@ -1433,17 +1459,18 @@ class OrganizationRepository {
             collapse: {
               field: 'uuid_organizationId',
             },
-            _source: ['uuid_organizationId'],
+            _source: ['uuid_organizationId', 'nested_identities', 'nested_weakIdentities'],
           }
 
-          const organizationsToMerge: IOrganizationIdOpensearch[] =
+          const organizationsToMerge: ISimilarOrganization[] =
             (
               await options.opensearch.search({
                 index: OpenSearchIndex.ORGANIZATIONS,
                 body: sameOrganizationsQueryBody,
               })
             ).body?.hits?.hits || []
 
+          /*
           const { maxScore, minScore } = organizationsToMerge.reduce<MinMaxScores>(
             (acc, organizationToMerge) => {
               if (!acc.minScore || organizationToMerge._score < acc.minScore) {
@@ -1458,10 +1485,11 @@ class OrganizationRepository {
             },
             { maxScore: null, minScore: null },
           )
+          */
 
           for (const organizationToMerge of organizationsToMerge) {
             yieldChunk.push({
-              similarity: normalizeScore(maxScore, minScore, organizationToMerge._score),
+              similarity: calculateSimilarity(organization, organizationToMerge),
               organizations: [
                 organization._source.uuid_organizationId,
                 organizationToMerge._source.uuid_organizationId,
@@ -1541,7 +1569,7 @@ class OrganizationRepository {
         organizations: [i, organizationToMergeResults[idx]],
         similarity: orgs[idx].similarity,
       }))
-      return { rows: result, count: orgs[0].total_count / 2, limit, offset }
+      return { rows: result, count: orgs[0].total_count, limit, offset }
     }
 
     return { rows: [{ organizations: [], similarity: 0 }], count: 0, limit, offset }
diff --git a/services/apps/search_sync_worker/src/repo/organization.repo.ts b/services/apps/search_sync_worker/src/repo/organization.repo.ts
@@ -113,7 +113,8 @@ export class OrganizationRepository extends RepositoryBase<OrganizationRepositor
             md."memberCount",
             i.identities,
             coalesce(tmd.to_merge_ids, array []::text[])       as "toMergeIds",
-            coalesce(nmd.no_merge_ids, array []::text[])       as "noMergeIds"
+            coalesce(nmd.no_merge_ids, array []::text[])       as "noMergeIds",
+            o."weakIdentities"
       from organizations o
               left join member_data md on o.id = md."organizationId"
               left join identities i on o.id = i."organizationId"