Как найти разные варианты написания одного и того же значения в R
library(dplyr)
library(stringdist)
Puzzled Porpoise
library(dplyr)
library(stringdist)
# Example data: city names entered inconsistently, so the same city shows up
# under several different spellings.
cities <- data.frame(city = c(
  "bangalore", "bengaluru", "banglore", "bangalore", "bangalore", "bangalore",
  "new york", "newyork", "nyork", "new york", "new york"
))

# One row per distinct spelling: `n` is the number of occurrences and `i` is a
# row id used below to join spellings onto the index pairs.
# `cities` must exist before this step, so it is defined first.
dta <- cities %>%
  group_by(city) %>%
  count() %>%
  ungroup() %>%
  mutate(i = row_number())

# Score every unordered pair of distinct spellings with the "jw" method of
# stringdist::stringsim() and list the most similar pairs first.
pairs <- expand.grid(x = seq_len(nrow(dta)), y = seq_len(nrow(dta))) %>%
  # Only need to compare i to all records j, with j > i
  filter(y > x) %>%
  # Attach spelling and count for the first member of each pair.
  left_join(dta, by = c(x = "i")) %>%
  rename(cityx = city, nx = n) %>%
  # Attach spelling and count for the second member of each pair.
  left_join(dta, by = c(y = "i")) %>%
  rename(cityy = city, ny = n) %>%
  mutate(similarity = stringsim(cityx, cityy, method = "jw")) %>%
  arrange(desc(similarity))

# Show the result; print() is explicit so the table also appears when the
# script is run via source(). Output recorded with the original snippet:
print(pairs)
# x y cityx nx cityy ny similarity
# 1 1 2 bangalore 4 banglore 1 0.9629630
# 2 4 5 new york 3 newyork 1 0.9583333
# 3 5 6 newyork 1 nyork 1 0.9047619
# 4 4 6 new york 3 nyork 1 0.8750000
# 5 2 3 banglore 1 bengaluru 1 0.7222222
# 6 1 3 bangalore 4 bengaluru 1 0.6944444
# 7 2 6 banglore 1 nyork 1 0.6583333
# 8 2 5 banglore 1 newyork 1 0.6011905
# 9 1 5 bangalore 4 newyork 1 0.5873016
# 10 2 4 banglore 1 new york 3 0.5833333
# 11 1 4 bangalore 4 new york 3 0.5694444
# 12 3 5 bengaluru 1 newyork 1 0.4761905
# 13 3 4 bengaluru 1 new york 3 0.4583333
# 14 1 6 bangalore 4 nyork 1 0.4370370
# 15 3 6 bengaluru 1 nyork 1 0.4370370