This evening I was asked to write a merge algorithm to efficiently merge multiple iterator sources, yielding a merged iterator that would not require the algorithm to read all of the data into memory should the sources be very large. I’ve never written such an algorithm nor can I recall seeing one, so I didn’t have a very good good answer. Of course that left a simmering thread of though on the back burner of my brain.
After letting it rattle around a bit and without resorting to old fashioned Googling, I sat down and banged out the following code. It was fun to write and works but it took me much too long to write from scratch—about 90 minutes. It may be time to refresh and reload, perhaps by writing a series of posts that implement C# versions of selected algorithms found in a book I recently purchased but have since spent no time reading: Introduction to Algorithms 3rd Edition.
Updated Code (9/6/2014)The original code gets a big performance boost with this refactoring:
public static IEnumerable<T> SortedMerge<T>
(params IEnumerable<T>[] sortedSources)
where T : IComparable
{
if (sortedSources == null || sortedSources.Length == 0)
throw new ArgumentNullException("sortedSources");
//1. fetch enumerators for each sourc
var enums = (from n in sortedSources
select n.GetEnumerator()).ToArray();
//2. create index list indicating what MoveNext returned for each enumerator
var enumHasValue = new List<bool>(enums.Length);
// MoveNext on all and initialize enumHasValue
for (int i = 0; i < enums.Length; i++)
{
enumHasValue.Add(enums[i].MoveNext());
}
// if all false, nothing to iterate over
if (enumHasValue.All(x => !x)) yield break;
//3. loop through
while (true)
{
//find index with lowest value
var lowIdx = -1;
T lowVal = default(T);
for (int i = 0; i < enums.Length; i++)
{
if (enumHasValue[i])
{
// must get first before doing any compares
if (lowIdx < 0
|| null == enums[i].Current //null sorts lowest
|| enums[i].Current.CompareTo(lowVal) < 0)
{
lowIdx = i;
lowVal = enums[i].Current;
}
}
}
//if none found, we're done
if (lowIdx < 0) break;
//get next value for enumerator chosen
enumHasValue[lowIdx] = enums[lowIdx].MoveNext();
//yield up the lowest value
yield return lowVal;
}
}
Here’s the original code. I hope you enjoy it. And if you see ways to improve on it, please let me know.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Merger
{
class Program
{
static void Main(string[] args)
{
int[] a = { 1, 3, 6, 102, 105, 230 };
int[] b = { 101, 103, 112, 155, 231 };
var mm = new MergeMania();
foreach(var val in mm.Merge<int>(a, b))
{
Console.WriteLine(val);
}
Console.ReadLine();
}
}
public class MergeMania
{
public IEnumerable<T> Merge<T>(params IEnumerable<T>[] sortedSources)
where T : IComparable
{
if (sortedSources == null || sortedSources.Length == 0)
throw new ArgumentNullException("sortedSources");
//1. fetch enumerators for each sourc
var enums = (from n in sortedSources
select n.GetEnumerator()).ToArray();
//2. fetch enumerators that have at least one value
var enumsWithValues = (from n in enums
where n.MoveNext()
select n).ToArray();
if (enumsWithValues.Length == 0) yield break; //nothing to iterate over
//3. sort by current value in List<IEnumerator<T>>
var enumsByCurrent = (from n in enumsWithValues
orderby n.Current
select n).ToList();
//4. loop through
while (true)
{
//yield up the lowest value
yield return enumsByCurrent[0].Current;
//move the pointer on the enumerator with that lowest value
if (!enumsByCurrent[0].MoveNext())
{
//remove the first item in the list
enumsByCurrent.RemoveAt(0);
//check for empty
if (enumsByCurrent.Count == 0) break; //we're done
}
enumsByCurrent = enumsByCurrent.OrderBy(x => x.Current).ToList();
}
}
}
}
And if this answers any questions for you, please do drop me a line to let me know.