1
1
import lodash , { chunk } from 'lodash'
2
+ import { get as getLevenshteinDistance } from 'fast-levenshtein'
2
3
import validator from 'validator'
3
4
import { FieldTranslatorFactory , OpensearchQueryParser } from '@crowd/opensearch'
4
5
import { PageData } from '@crowd/common'
@@ -27,6 +28,11 @@ import SegmentRepository from './segmentRepository'
27
28
28
29
const { Op } = Sequelize
29
30
31
/**
 * One identity entry of an organization document, as read from the
 * `nested_identities` / `nested_weakIdentities` fields of a search hit.
 */
interface IOrganizationIdentityOpensearch {
  /** Platform the identity belongs to. */
  string_platform: string
  /** Name of the organization on that platform. */
  string_name: string
}
35
+
30
36
interface IOrganizationPartialAggregatesOpensearch {
31
37
_source : {
32
38
uuid_organizationId : string
@@ -38,10 +44,12 @@ interface IOrganizationPartialAggregatesOpensearch {
38
44
}
39
45
}
40
46
41
- interface IOrganizationIdOpensearch {
47
/**
 * A search hit describing an organization that is a merge candidate for
 * another organization.
 */
interface ISimilarOrganization {
  /** Relevance score reported by the search engine for this hit. */
  _score: number
  _source: {
    /** Id of the candidate organization. */
    uuid_organizationId: string
    /** Identities of the candidate organization. */
    nested_identities: IOrganizationIdentityOpensearch[]
    /** Weak identities of the candidate organization. */
    nested_weakIdentities: IOrganizationIdentityOpensearch[]
  }
}
47
55
@@ -54,8 +62,6 @@ interface IOrganizationNoMerge {
54
62
noMergeId : string
55
63
}
56
64
57
- type MinMaxScores = { maxScore : number ; minScore : number }
58
-
59
65
class OrganizationRepository {
60
66
static async filterByPayingTenant (
61
67
tenantId : string ,
@@ -1162,28 +1168,48 @@ class OrganizationRepository {
1162
1168
return 10
1163
1169
}
1164
1170
1165
- const normalizeScore = ( max : number , min : number , score : number ) : number => {
1166
- if ( score > 100 ) {
1167
- return 1
1168
- }
1171
/**
 * Scores how similar a merge candidate is to the primary organization.
 *
 * Rules, in order:
 * 1. If any weak identity of the candidate has the same name and platform as
 *    an identity of the primary organization, the score is 0.95.
 * 2. Otherwise the smallest Levenshtein distance between any primary identity
 *    name and any candidate identity name is taken, and the score is
 *    `(length - distance) / length` of that primary name, truncated to two
 *    decimals.
 * 3. If that distance is larger than the primary name itself, a random score
 *    between 0.20 and 0.40 is returned instead.
 * 4. If no pair of identities can be compared (one side has no identities),
 *    or the closest pair consists of two empty names, the score is 0.
 *
 * @param primaryOrganization organization the suggestions are generated for
 * @param similarOrganization candidate organization returned by the search
 * @returns similarity as a fraction between 0 and 1
 */
const calculateSimilarity = (
  primaryOrganization: IOrganizationPartialAggregatesOpensearch,
  similarOrganization: ISimilarOrganization,
): number => {
  // Documents may lack these arrays; treat a missing field as "no identities".
  const primaryIdentities = primaryOrganization._source.nested_identities ?? []
  const candidateIdentities = similarOrganization._source.nested_identities ?? []
  const candidateWeakIdentities = similarOrganization._source.nested_weakIdentities ?? []

  let smallestEditDistance: number | null = null
  let closestPrimaryIdentity: IOrganizationIdentityOpensearch | null = null

  for (const primaryIdentity of primaryIdentities) {
    // A weak identity of the candidate equal to one of the primary
    // organization's identities is treated as a near-certain match.
    const matchesWeakIdentity = candidateWeakIdentities.some(
      (weakIdentity) =>
        weakIdentity.string_name === primaryIdentity.string_name &&
        weakIdentity.string_platform === primaryIdentity.string_platform,
    )
    if (matchesWeakIdentity) {
      return 0.95
    }

    // Track the closest pair of names across both identity arrays.
    for (const candidateIdentity of candidateIdentities) {
      const currentDistance = getLevenshteinDistance(
        primaryIdentity.string_name,
        candidateIdentity.string_name,
      )
      if (smallestEditDistance === null || currentDistance < smallestEditDistance) {
        smallestEditDistance = currentDistance
        closestPrimaryIdentity = primaryIdentity
      }
    }
  }

  // Nothing could be compared: there is no evidence of similarity.
  if (smallestEditDistance === null || closestPrimaryIdentity === null) {
    return 0
  }

  const identityLength = closestPrimaryIdentity.string_name.length

  if (identityLength < smallestEditDistance) {
    // The Levenshtein distance is bigger than the name itself, so it might be
    // a prefix match: return a randomized score between 0.20 and 0.40.
    return (Math.floor(Math.random() * 21) + 20) / 100
  }

  if (identityLength === 0) {
    // Both names are empty (distance 0); the ratio would be 0 / 0 = NaN.
    return 0
  }

  // Share of the primary name that did not need editing, truncated to 2 decimals.
  return Math.floor(((identityLength - smallestEditDistance) / identityLength) * 100) / 100
}
1188
1214
1189
1215
const tenant = SequelizeRepository . getCurrentTenant ( options )
@@ -1433,17 +1459,18 @@ class OrganizationRepository {
1433
1459
collapse : {
1434
1460
field : 'uuid_organizationId' ,
1435
1461
} ,
1436
- _source : [ 'uuid_organizationId' ] ,
1462
+ _source : [ 'uuid_organizationId' , 'nested_identities' , 'nested_weakIdentities' ] ,
1437
1463
}
1438
1464
1439
- const organizationsToMerge : IOrganizationIdOpensearch [ ] =
1465
+ const organizationsToMerge : ISimilarOrganization [ ] =
1440
1466
(
1441
1467
await options . opensearch . search ( {
1442
1468
index : OpenSearchIndex . ORGANIZATIONS ,
1443
1469
body : sameOrganizationsQueryBody ,
1444
1470
} )
1445
1471
) . body ?. hits ?. hits || [ ]
1446
1472
1473
+ /*
1447
1474
const { maxScore, minScore } = organizationsToMerge.reduce<MinMaxScores>(
1448
1475
(acc, organizationToMerge) => {
1449
1476
if (!acc.minScore || organizationToMerge._score < acc.minScore) {
@@ -1458,10 +1485,11 @@ class OrganizationRepository {
1458
1485
},
1459
1486
{ maxScore: null, minScore: null },
1460
1487
)
1488
+ */
1461
1489
1462
1490
for ( const organizationToMerge of organizationsToMerge ) {
1463
1491
yieldChunk . push ( {
1464
- similarity : normalizeScore ( maxScore , minScore , organizationToMerge . _score ) ,
1492
+ similarity : calculateSimilarity ( organization , organizationToMerge ) ,
1465
1493
organizations : [
1466
1494
organization . _source . uuid_organizationId ,
1467
1495
organizationToMerge . _source . uuid_organizationId ,
@@ -1541,7 +1569,7 @@ class OrganizationRepository {
1541
1569
organizations : [ i , organizationToMergeResults [ idx ] ] ,
1542
1570
similarity : orgs [ idx ] . similarity ,
1543
1571
} ) )
1544
- return { rows : result , count : orgs [ 0 ] . total_count / 2 , limit, offset }
1572
+ return { rows : result , count : orgs [ 0 ] . total_count , limit, offset }
1545
1573
}
1546
1574
1547
1575
return { rows : [ { organizations : [ ] , similarity : 0 } ] , count : 0 , limit, offset }
0 commit comments