Skip to content

Commit 5e82450

Browse files
authored
Post processing edit distances for more precise similarity scores (#1692)
1 parent 7e266c7 commit 5e82450

File tree

4 files changed

+105
-28
lines changed

4 files changed

+105
-28
lines changed

backend/package-lock.json

Lines changed: 50 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@
5454
"@crowd/common": "file:../services/libs/common",
5555
"@crowd/integrations": "file:../services/libs/integrations",
5656
"@crowd/logging": "file:../services/libs/logging",
57-
"@crowd/tracing": "file:../services/libs/tracing",
5857
"@crowd/opensearch": "file:../services/libs/opensearch",
5958
"@crowd/redis": "file:../services/libs/redis",
6059
"@crowd/sqs": "file:../services/libs/sqs",
60+
"@crowd/tracing": "file:../services/libs/tracing",
6161
"@crowd/types": "file:../services/libs/types",
6262
"@cubejs-client/core": "^0.30.4",
6363
"@google-cloud/storage": "5.3.0",
@@ -97,6 +97,7 @@
9797
"erlpack": "^0.1.4",
9898
"express": "4.17.1",
9999
"express-rate-limit": "6.5.1",
100+
"fast-levenshtein": "^3.0.0",
100101
"formidable-serverless": "1.1.1",
101102
"he": "^1.2.0",
102103
"helmet": "4.1.1",

backend/src/database/repositories/organizationRepository.ts

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import lodash, { chunk } from 'lodash'
2+
import { get as getLevenshteinDistance } from 'fast-levenshtein'
23
import validator from 'validator'
34
import { FieldTranslatorFactory, OpensearchQueryParser } from '@crowd/opensearch'
45
import { PageData } from '@crowd/common'
@@ -27,6 +28,11 @@ import SegmentRepository from './segmentRepository'
2728

2829
const { Op } = Sequelize
2930

31+
interface IOrganizationIdentityOpensearch {
32+
string_platform: string
33+
string_name: string
34+
}
35+
3036
interface IOrganizationPartialAggregatesOpensearch {
3137
_source: {
3238
uuid_organizationId: string
@@ -38,10 +44,12 @@ interface IOrganizationPartialAggregatesOpensearch {
3844
}
3945
}
4046

41-
interface IOrganizationIdOpensearch {
47+
interface ISimilarOrganization {
4248
_score: number
4349
_source: {
4450
uuid_organizationId: string
51+
nested_identities: IOrganizationIdentityOpensearch[]
52+
nested_weakIdentities: IOrganizationIdentityOpensearch[]
4553
}
4654
}
4755

@@ -54,8 +62,6 @@ interface IOrganizationNoMerge {
5462
noMergeId: string
5563
}
5664

57-
type MinMaxScores = { maxScore: number; minScore: number }
58-
5965
class OrganizationRepository {
6066
static async filterByPayingTenant(
6167
tenantId: string,
@@ -1162,28 +1168,48 @@ class OrganizationRepository {
11621168
return 10
11631169
}
11641170

1165-
const normalizeScore = (max: number, min: number, score: number): number => {
1166-
if (score > 100) {
1167-
return 1
1168-
}
1171+
const calculateSimilarity = (
1172+
primaryOrganization: IOrganizationPartialAggregatesOpensearch,
1173+
similarOrganization: ISimilarOrganization,
1174+
): number => {
1175+
let smallestEditDistance: number = null
11691176

1170-
if (max === min) {
1171-
return (40 + Math.floor(Math.random() * 26) - 10) / 100
1172-
}
1173-
1174-
const normalizedScore = (score - min) / (max - min)
1177+
let similarPrimaryIdentity: IOrganizationIdentityOpensearch = null
11751178

1176-
// randomize the cases where score === max and score === min
1177-
if (normalizedScore === 1) {
1178-
return Math.floor(Math.random() * (76 - 50) + 50) / 100
1179+
// find the smallest edit distance between both identity arrays
1180+
for (const primaryIdentity of primaryOrganization._source.nested_identities) {
1181+
// the similar organization has a weak identity matching one of the primary organization's strong identities (same name and platform): return a similarity of 0.95
1182+
if (
1183+
similarOrganization._source.nested_weakIdentities.length > 0 &&
1184+
similarOrganization._source.nested_weakIdentities.some(
1185+
(weakIdentity) =>
1186+
weakIdentity.string_name === primaryIdentity.string_name &&
1187+
weakIdentity.string_platform === primaryIdentity.string_platform,
1188+
)
1189+
) {
1190+
return 0.95
1191+
}
1192+
for (const secondaryIdentity of similarOrganization._source.nested_identities) {
1193+
const currentLevenstheinDistance = getLevenshteinDistance(
1194+
primaryIdentity.string_name,
1195+
secondaryIdentity.string_name,
1196+
)
1197+
if (smallestEditDistance === null || smallestEditDistance > currentLevenstheinDistance) {
1198+
smallestEditDistance = currentLevenstheinDistance
1199+
similarPrimaryIdentity = primaryIdentity
1200+
}
1201+
}
11791202
}
11801203

1181-
// normalization is resolved to 0, randomize it
1182-
if (normalizedScore === 0) {
1183-
return Math.floor(Math.random() * (41 - 20) + 20) / 100
1204+
// calculate similarity percentage
1205+
const identityLength = similarPrimaryIdentity.string_name.length
1206+
1207+
if (identityLength < smallestEditDistance) {
1208+
// if the Levenshtein distance is larger than the length of the identity name, it might be a prefix match: return a medium similarity (a random value between 0.20 and 0.40)
1209+
return (Math.floor(Math.random() * 21) + 20) / 100
11841210
}
11851211

1186-
return normalizedScore
1212+
return Math.floor(((identityLength - smallestEditDistance) / identityLength) * 100) / 100
11871213
}
11881214

11891215
const tenant = SequelizeRepository.getCurrentTenant(options)
@@ -1433,17 +1459,18 @@ class OrganizationRepository {
14331459
collapse: {
14341460
field: 'uuid_organizationId',
14351461
},
1436-
_source: ['uuid_organizationId'],
1462+
_source: ['uuid_organizationId', 'nested_identities', 'nested_weakIdentities'],
14371463
}
14381464

1439-
const organizationsToMerge: IOrganizationIdOpensearch[] =
1465+
const organizationsToMerge: ISimilarOrganization[] =
14401466
(
14411467
await options.opensearch.search({
14421468
index: OpenSearchIndex.ORGANIZATIONS,
14431469
body: sameOrganizationsQueryBody,
14441470
})
14451471
).body?.hits?.hits || []
14461472

1473+
/*
14471474
const { maxScore, minScore } = organizationsToMerge.reduce<MinMaxScores>(
14481475
(acc, organizationToMerge) => {
14491476
if (!acc.minScore || organizationToMerge._score < acc.minScore) {
@@ -1458,10 +1485,11 @@ class OrganizationRepository {
14581485
},
14591486
{ maxScore: null, minScore: null },
14601487
)
1488+
*/
14611489

14621490
for (const organizationToMerge of organizationsToMerge) {
14631491
yieldChunk.push({
1464-
similarity: normalizeScore(maxScore, minScore, organizationToMerge._score),
1492+
similarity: calculateSimilarity(organization, organizationToMerge),
14651493
organizations: [
14661494
organization._source.uuid_organizationId,
14671495
organizationToMerge._source.uuid_organizationId,
@@ -1541,7 +1569,7 @@ class OrganizationRepository {
15411569
organizations: [i, organizationToMergeResults[idx]],
15421570
similarity: orgs[idx].similarity,
15431571
}))
1544-
return { rows: result, count: orgs[0].total_count / 2, limit, offset }
1572+
return { rows: result, count: orgs[0].total_count, limit, offset }
15451573
}
15461574

15471575
return { rows: [{ organizations: [], similarity: 0 }], count: 0, limit, offset }

services/apps/search_sync_worker/src/repo/organization.repo.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ export class OrganizationRepository extends RepositoryBase<OrganizationRepositor
113113
md."memberCount",
114114
i.identities,
115115
coalesce(tmd.to_merge_ids, array []::text[]) as "toMergeIds",
116-
coalesce(nmd.no_merge_ids, array []::text[]) as "noMergeIds"
116+
coalesce(nmd.no_merge_ids, array []::text[]) as "noMergeIds",
117+
o."weakIdentities"
117118
from organizations o
118119
left join member_data md on o.id = md."organizationId"
119120
left join identities i on o.id = i."organizationId"

0 commit comments

Comments
 (0)