Как рассчитать дельта (вставленные/удаленные/перемещенные индексы) двух списков?

Предполагая, что у меня есть два списка объектов с уникальными идентификаторами и атрибутом, определяющим их порядок, как я могу эффективно получить дельта-индексы (какие индексы были вставлены, которые были удалены и которые были перемещены)?

Пример ввода:

let before: [(id: String, timestamp: String)] = [
    ("A", "2015-06-04T12:38:09Z"),
    ("B", "2015-06-04T10:12:45Z"),
    ("C", "2015-06-04T08:39:55Z"),
    ("D", "2015-06-03T23:58:32Z"),
    ("E", "2015-06-01T00:05:51Z"),
]

let after: [(id: String, timestamp: String)] = [
    ("F", "2015-06-04T16:13:01Z"),
    ("C", "2015-06-04T15:10:29Z"),
    ("A", "2015-06-04T12:38:09Z"),
    ("B", "2015-06-04T10:12:45Z"),
]

let delta = deltaFn(before, after)

Здесь выше показано:

BEFORE                                   AFTER
+-------+----+----------------------+    +-------+----+----------------------+
| index | id | timestamp            |    | index | id | timestamp            |
+-------+----+----------------------+    +-------+----+----------------------+
|     0 |  A | 2015-06-04T12:38:09Z |    |     0 |  F | 2015-06-04T16:13:01Z |
|     1 |  B | 2015-06-04T10:12:45Z |    |     1 |  C | 2015-06-04T15:10:29Z |
|     2 |  C | 2015-06-04T08:39:55Z |    |     2 |  A | 2015-06-04T12:38:09Z |
|     3 |  D | 2015-06-03T23:58:32Z |    |     3 |  B | 2015-06-04T10:12:45Z |
|     4 |  E | 2015-06-01T00:05:51Z |    |     - |    |                      |
+-------+----+----------------------+    +-------+----+----------------------+

Ожидаемый результат (дельта):

Inserted indexes:  [0]
Deleted indexes:   [3, 4]
Moved indexes:     [(from: 0, to: 2), (from: 1, to: 3), (from: 2, to: 1)]

Ответ 1

Его можно решить, используя 2 карты, которые отображают из идентификатора каждого элемента в свой индекс и сравнивают их.

Сложность времени - это O (n) для хэш-карт и O (nlogn) для древовидных карт.

Псевдокод:

map1 = empty map
map2 = empty map
for each element x with index i in before:
    map1.insert(x,i)
for each element x with index i in after:
    map2.insert(x,i)

//find moved and deleted:
for each key x in map1:
   id1 = map1.get(x)
   id2 = map2.get(x)
   if id2 == nil:
       add id1 to "deleted indexes"
   else if id1 != id2:
       add (id1,id2) to "moved indexes"
       map2.delete(x)
//find new indexes:
for each key x in map2:
    add map2.get(x) to "inserted indexes"

Изменить: (рекомендуется в комментариях)

Вы можете минимизировать вывод памяти на O(min{m,n}) и время в случае древовидной карты на O(max{m,n}log(min{m,n})), где m,n - это размеры двух списков, путем сопоставления только самого маленького списка, а затем итерации массива (который не был отображен), а не карты.

map = empty map
for each element x with index i in smaller list:
    map.insert(x,i)

for each element x with index i1 in larger list:
   i2 = map.get(x)
   if i2:
       if i1 != i2:
           add (i2, i1) to "moved indexes" if smaller list is before
           add (i1, i2) to "moved indexes" if smaller list is after
       map.delete(x)
   else:
       add i1 to "inserted indexes" if smaller list is before
       add i1 to "deleted indexes" if smaller list is after

// Find new indexes:
for each key x in map:
    add map.get(x) to "deleted indexes" if smaller list is before
    add map.get(x) to "inserted indexes" if smaller list is after

Ответ 2

Возможное решение (похожее на @amit ответ, но используя только один карта):

// A dictionary mapping each id to a pair
//    ( oldIndex, newIndex )
// where oldIndex = -1 for inserted elements
// and newIndex = -1 for deleted elements.
var map : [ String : (from: Int, to: Int)] = [:]

// Add [ id : (from, -1) ] for each id in before:
for (idx, elem) in enumerate(before) {
    map[elem.id] = (from: idx, to: -1)
}

// Update [ id : (from, to) ] or add [ id : (-1, to) ] for each id in after:
for (idx, elem) in enumerate(after) {
    if (map[elem.id]?.to = idx) == nil {
        map[elem.id] = (from: -1, to: idx)
    }
}

var insertedIndices : [Int] = []
var deletedIndices : [Int] = []
var movedIndices : [(from: Int, to: Int)] = []

// Compare from: and to: index for each dictionary value:
for pair in map.values {
    switch pair {
    case (let fromIdx, -1):
        deletedIndices.append(fromIdx)
    case (-1, let toIdx):
        insertedIndices.append(toIdx)
    default:
        movedIndices.append(pair)
    }
}

println(insertedIndices) // [0]
println(deletedIndices)  // [3, 4]
println(movedIndices)    // [(1, 3), (0, 2), (2, 1)]

В качестве альтернативы используйте опции, указывающие на отсутствие старого или нового индекса, как это показано в @doisk:

// A dictionary mapping each id to a pair
//    ( oldIndex, newIndex )
// where oldIndex = nil for inserted elements
// and newIndex = nil for deleted elements.
var map : [ String : (from: Int?, to: Int?)] = [:]

// Add [ id : (from, nil) ] for each id in before:
for (idx, elem) in enumerate(before) {
    map[elem.id] = (from: idx, to: nil)
}

// Update [ id : (from, to) ] or add [ id : (nil, to) ] for each id in after:
for (idx, elem) in enumerate(after) {
    map[elem.id] = (map[elem.id]?.from, idx)
}

// Compare:
var insertedIndices : [Int] = []
var deletedIndices : [Int] = []
var movedIndices : [(from: Int, to: Int)] = []

for pair in map.values {
    switch pair {
    case (let .Some(fromIdx), let .Some(toIdx)):
        movedIndices.append(from: fromIdx, to: toIdx)
    case (let .Some(fromIdx), .None):
        deletedIndices.append(fromIdx)
    case (.None, let .Some(toIdx)):
        insertedIndices.append(toIdx)
    default:
        fatalError("Oops") // This should not happen!
    }
}

Ответ 3

Мое решение не использует функцию карты. Вычислительная сложность - это O (n * m), где n: elms in before и m: elms in after.

И я боюсь, что это не лучшие доступные решения... но вот это:)

import Foundation

// Elm class that contains id and timestamp and is Equatable
class Elm {
    let id : String
    let timestamp : String
    init(tuple : (id:String, timestamp:String)) {
        self.id = tuple.id
        self.timestamp = tuple.timestamp
    }
}
func ==(lhs: Elm, rhs: Elm) -> Bool {
    return lhs.id == rhs.id
}
extension Elm : Equatable {}

// data
let before: [Elm] = [
    Elm(tuple: ("A", "2015-06-04T12:38:09Z")),
    Elm(tuple: ("B", "2015-06-04T10:12:45Z")),
    Elm(tuple: ("C", "2015-06-04T08:39:55Z")),
    Elm(tuple: ("D", "2015-06-03T23:58:32Z")),
    Elm(tuple: ("E", "2015-06-01T00:05:51Z"))
]

let after: [Elm] = [
    Elm(tuple: ("F", "2015-06-04T16:13:01Z")),
    Elm(tuple: ("C", "2015-06-04T15:10:29Z")),
    Elm(tuple: ("A", "2015-06-04T12:38:09Z")),
    Elm(tuple: ("B", "2015-06-04T10:12:45Z"))
]

// O(m * n)
func inserted(before:[Elm], after:[Elm]) -> [Int] {
    var inserted = [Int]()
    for (index, elm) in enumerate(after) {
        if !contains(before, elm) {
            inserted.append(index)
        }
    }
    return inserted
}

// O(n * m)
func deleted(before:[Elm], after:[Elm]) -> [Int] {
    var deleted = [Int]()
    for (index, elm) in enumerate(before) {
        if !contains(after, elm) {
            deleted.append(index)
        }
    }
    return deleted
}

// O(n * m)
func moved(before:[Elm], after:[Elm]) -> [Int:Int] {
    var moved = [Int:Int]()
    for (index, elm) in enumerate(before) {
        if contains(after, elm) && (after[index] != before[index]) {
            moved[index] = find(after, elm)
        }
    }
    return moved
}

inserted(before, after)
deleted(before, after)
moved(before, after)

Ответ 4

Вот что мне удалось:

var map: [String : (bef: Int?, aft: Int?)] = [:]

for (idx, (bef, aft)) in zipWithPadding(before, after).enumerate()
  where bef?.id != aft?.id {
  bef.map{map[$0.id] = (idx, map[$0.id]?.aft)}
  aft.map{map[$0.id] = (map[$0.id]?.bef, idx)}
}

for (val, id) in map {
  switch id {
  case (_, nil):  print("\(val): del at \(id.bef!)")
  case (nil, _):  print("\(val): ins at \(id.aft!)")
  default:        print("\(val): mov from \(id.bef!) to \(id.aft!)")
  }
}

//D: del at 3
//E: del at 4
//F: ins at 0
//B: mov from 1 to 3
//A: mov from 0 to 2
//C: mov from 2 to 1

Этот метод почти такой же, как и другие ответы на карту, за исключением того, что он имеет меньше циклов, и он пропускает значения, одинаковые в каждом массиве. map здесь - словарь строк (id в вашем массиве) и кортежи. Кортежи Int s, соответствующие индексу заданного id в вашем первом массиве, и индекс того же id во втором. Int являются необязательными: так мы выясняем, что произошло с каждым id. Если первое значение равно nil, а второе - нет, тогда вставляется id. Если второе значение равно нулю, оно было удалено. И если оба Int не ноль, то это id было перемещено от первого ко второму.

Способ заполнения карты состоит в том, чтобы перебрать результат функции zipWithPadding, которая находится здесь:

func zipWithPadding <
  S0: SequenceType, S1: SequenceType, E0, E1 where
  S0.Generator.Element == E0, S1.Generator.Element == E1
  > (s0: S0, _ s1: S1) -> AnyGenerator<(E0?, E1?)> {

    var (g0, g1) :
    (S0.Generator?, S1.Generator?) =
    (s0.generate(), s1.generate())

    return anyGenerator {
      let e0: E0? = g0?.next() ?? {g0 = nil; return nil}()
      let e1: E1? = g1?.next() ?? {g1 = nil; return nil}()
      return (e0 != nil || e1 != nil) ? (e0, e1) : nil
    }
}

Я получил его из здесь. Причина, по которой вы не можете использовать стандартную библиотеку zip, заключается в том, что она завершится, как только будет выполнена одна из базовых последовательностей. Здесь, однако, последовательности имеют разную длину. Эта функция zip возвращает генератор кортежей последовательных элементов из двух аргументов последовательности. Если любая последовательность заканчивается перед другой, последующие кортежи будут иметь такое значение последовательности, равное нулю. Вот пример:

Array(zipWithPadding([1, 2, 3], [1, 2]))
//[({Some 1}, {Some 1}), ({Some 2}, {Some 2}), ({Some 3}, nil)]

Поскольку генераторам не гарантировано постоянно возвращать nil после того, как они возвратили nil один раз (генератор возвращает nil, чтобы сигнализировать о его завершении), вы не можете просто продолжать использовать один и тот же генератор для вашего значения кортежа. Вот почему сам генератор установлен в nil, когда он возвращает nil: так что вы больше не вызываете его.

Однако генераторы массива, похоже, возвращают нуль после последнего значения. Итак, если вы не против поведения undefined:

func zipWithPadding <
  S0: SequenceType, S1: SequenceType, E0, E1 where
  S0.Generator.Element == E0, S1.Generator.Element == E1
  > (s0: S0, s1: S1) -> AnyGenerator<(E0?, E1?)> {

    var (g0, g1) = (s0.generate(), s1.generate())

    return anyGenerator {
      let (e0, e1) = (g0.next(), g1.next())
      return e0 != nil || e1 != nil ? (e0, e1) : nil
    }
}

Как только ваш генератор перейдет в цикл, остальная идея будет простой. Идентификаторы до и после помещаются в словарь, и если они уже не были в словаре, соответствующий индекс в кортеже устанавливается равным nil. (map[$0.id]?.aft вернет nil, если $0.id не находится в словаре).

С точки зрения эффективности, есть несколько мест, которые, я думаю, этот подход будет работать. Кажется, что он использует только один цикл, а не два, но пользовательская функция zipWithPadding добавляет столько накладных расходов, что один цикл фактически менее эффективен, чем два последовательных цикла. Аналогично, использование только одного enumerate() кажется эффективным, но опять же, накладные расходы не стоят. (стоит отметить, что если два массива имеют одинаковую длину, стандартная библиотека zip даст вам очень быстрый вариант)

Этот метод позволяет вам пропускать элементы, которые одинаковы в обоих массивах, но что-то, что вы не можете сделать с двумя для циклов. В некоторых быстрых тестах, похоже, что с помощью этого метода более быстро обрабатываются массивы с более чем четвертью их элементов в тех же позициях. Это также ускорит цикл по карте после этого: он будет содержать только различия, поэтому он будет короче, и вам не нужно будет проверять изменения.

UPDATE:

Я пытался выяснить, как избавиться от некоторых накладных расходов, особенно в отношении генератора. Я создал пользовательскую структуру:

struct PaddedZipGenerator<G0: GeneratorType, G1: GeneratorType> : GeneratorType {

  typealias E0 = G0.Element
  typealias E1 = G1.Element

  typealias Element = (E0?, E1?)

  private var (g0, g1): (G0?, G1?)

  mutating func next() -> PaddedZipGenerator.Element? {
    let e0: E0? = g0?.next() ?? {g0 = nil; return nil}()
    let e1: E1? = g1?.next() ?? {g1 = nil; return nil}()
    return (e0 != nil || e1 != nil) ? (e0, e1) : nil
  }
}

struct PaddedZip<S0: SequenceType, S1: SequenceType> : SequenceType {

  typealias Generator = PaddedZipGenerator<S0.Generator, S1.Generator>

  private let (s0, s1): (S0, S1)

  func generate() -> PaddedZip.Generator {
    return PaddedZipGenerator(g0: s0.generate(), g1: s1.generate())
  }
}

func zipWithPadding<S0: SequenceType, S1: SequenceType>(s0: S0, _ s1: S1) -> PaddedZip<S0, S1> {
  return PaddedZip(s0: s0, s1: s1)
}

И похоже, что это сработало! При базовом тестировании кажется, что эта функция zipWithPadding работает довольно быстро. Кажется, он работает быстрее, чем два для циклов, даже если оба списка не содержат ни одного элемента.

Ответ 5

Здесь будет выглядеть как:

func deltaFn(before: [(id: String, timestamp: String)], after: [(id: String, timestamp: String)] ) -> ([Int], [Int], [String]) {

    // Get arrays of just the ids...
    let beforeIds = before.map { $0.id }
    let afterIds = after.map { $0.id }

    // Get the inserted and moved indexes...
    let (inserted, moved) = reduce(0..<afterIds.count, (inserted: [Int](), moved: [String]())) { 

        (var changes, index) -> ([Int], [String]) in

        if let beforeIndex = find(beforeIds, afterIds[index])  {
            if beforeIndex != index {
                changes.moved.append("(from: \(beforeIndex), to: \(index))")
            }
        } else {
            changes.inserted.append(index)
        }
        return changes
    }

    // Get the deleted indexes...
    let deleted = reduce(0..<beforeIds.count, [Int]()) { deleted, index in
        return contains(afterIds, beforeIds[index])
            ? deleted
            : deleted + [index]
    }

    // Return them all as a tuple...
    return (inserted, deleted, moved)
}

let (inserted, deleted, moved) = deltaFn(before, after)

println("Inserted: \(inserted)")  // Inserted: [0]
println("Deleted: \(deleted)")    // Deleted: [3, 4]
println("Moved: \(moved)")        // Moved: [(from: 2, to: 1), (from: 0, to: 2), (from: 1, to: 3)]

Работает так, как ожидалось, и относительно легко на глазах.

Обратите внимание, что синтаксис вызовов reduce будет отличаться, если вы используете Swift 2.0. Например,

reduce(0..<afterIds.count, (inserted: [Int](), moved: [String]()))

становится...

(0..<afterIds.count).reduce((inserted: [Int](), moved: [String]()))