Okay as part of my internship I have to deal with a huge number of text strings.
These strings come into the program unsorted and they must be sorted and each one must be unique. (i.e. no duplicates).
Now there are a few different ways to be able to store this data.
A SortedList, SortedDictionary, or two different forms of lists, the first where before each add you check to make sure that the data doesn’t already exist, and the second where you just add then sort then remove duplicates at a later time.
Two work out what one would be best I wrote a program to determine which form of storage operated the fastest on input. The results follow and then the code for how I did it.
As you can see the SortedDictionary worked the best, however at the early stages both the SortedList and Duplicate List gave it a run for it’s money.
Results
Sorted List Test: 00:00:00.1169883 Input: 5000 4000 List Size: 2838 Sorted Dictionary Test: 00:00:00.1339866 Input: 5000 4000 List Size: 2858 Unique List Test: 00:00:00.1119888 Input: 5000 4000 List Size: 2862 Duplicate List Test: 00:00:00.0239976 Input: 5000 4000 List Size: 2832
Sorted List Test: 00:00:01.3768623 Input: 50000 40000 List Size: 28516 Sorted Dictionary Test: 00:00:01.0678932 Input: 50000 40000 List Size: 28466 Unique List Test: 00:00:12.3097689 Input: 50000 40000 List Size: 28384 Duplicate List Test: 00:00:01.2058794 Input: 50000 40000 List Size: 28549
Sorted List Test: 00:01:36.9733017 Input: 500000 400000 List Size: 285367 Sorted Dictionary Test: 00:00:12.3307668 Input: 500000 400000 List Size: 285422 Duplicate List Test: 00:02:32.7467238 Input: 500000 400000 List Size: 285506
Sorted Dictionary Test: 00:02:37.8040000 Input: 5000000 4000000 List Size: 2854095
Code for Program.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace SortTest
{
class Program
{
public static void Main()
{
// 5,000 item list
SortedListTest.run(5000,4000);
SortedDictionaryTest.run(5000, 4000);
UniqueListTest.run(5000, 4000);
DuplicateListTest.run(5000, 4000);
// 50,000 item list
SortedListTest.run(50000, 40000);
SortedDictionaryTest.run(50000, 40000);
UniqueListTest.run(50000, 40000);
DuplicateListTest.run(50000, 40000);
// 500,000 item list
SortedListTest.run(500000, 400000);
SortedDictionaryTest.run(500000, 400000);
//UniqueListTest.run(500000, 400000);
DuplicateListTest.run(500000, 400000);
// 5,000,000 item list
//SortedListTest.run(5000000, 4000000);
SortedDictionaryTest.run(5000000, 4000000);
//UniqueListTest.run(5000000, 4000000);
//DuplicateListTest.run(5000000, 4000000);
}
}
}
Code for SortedListTest.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace SortTest
{
class SortedListTest
{
private static SortedList
public static void run(int loopSize, int multiplier)
{
words = new SortedList
DateTime startTime = DateTime.Now;
Random rand = new Random();
int i;
for (int j = 0; j != loopSize; j++)
{
i = (int)(rand.NextDouble() * multiplier);
try
{
string s = i.ToString();
words.Add(s, s[0]);
}
catch (Exception e)
{
continue;
}
}
DateTime stopTime = DateTime.Now;
TimeSpan duration = stopTime - startTime;
Console.WriteLine("Sorted List Test: " + duration.ToString() + " Input: " + loopSize + " " + multiplier + " List Size: " + words.Count);
}
}
}
Code for SortedDictionaryTest.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace SortTest
{
class SortedDictionaryTest
{
private static SortedDictionary
public static void run(int loopSize, int multiplier)
{
words = new SortedDictionary
DateTime startTime = DateTime.Now;
Random rand = new Random();
int i;
for (int j = 0; j != loopSize; j++)
{
i = (int)(rand.NextDouble() * multiplier);
try
{
string s = i.ToString();
words.Add(s, s[0]);
}
catch (Exception e)
{
continue;
}
}
DateTime stopTime = DateTime.Now;
TimeSpan duration = stopTime - startTime;
Console.WriteLine("Sorted Dictionary Test: " + duration.ToString() + " Input: " + loopSize + " " + multiplier + " List Size: " + words.Count);
}
}
}
Code for UniqueListTest.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace SortTest
{
class UniqueListTest
{
private static List
public static void run(int loopSize, int multiplier)
{
words = new List
DateTime startTime = DateTime.Now;
Random rand = new Random();
int i;
for (int j = 0; j != loopSize; j++)
{
i = (int)(rand.NextDouble() * multiplier);
try
{
string s = i.ToString();
bool matchfound = false;
foreach (string w in words) // ignore duplicates
{
if (w == s)
{
matchfound = true;
break;
}
}
if (!matchfound) words.Add(s);
}
catch (Exception e)
{
continue;
}
}
words.Sort();
DateTime stopTime = DateTime.Now;
TimeSpan duration = stopTime - startTime;
Console.WriteLine("Unique List Test: " + duration.ToString() + " Input: " + loopSize + " " + multiplier + " List Size: " + words.Count);
}
}
}
Code for DuplicateListTest.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace SortTest
{
class DuplicateListTest
{
private static List
public static void run(int loopSize, int multiplier)
{
words = new List
DateTime startTime = DateTime.Now;
Random rand = new Random();
int i;
for (int j = 0; j != loopSize; j++)
{
i = (int)(rand.NextDouble() * multiplier);
try
{
string s = i.ToString();
words.Add(s);
}
catch (Exception e)
{
continue;
}
}
words.Sort();
string prev = null;
for(int counter = 0; counter < words.Count; counter++)
{
if (prev == words[counter])
{
words.RemoveAt(counter);
counter--;
}
prev = words[counter];
}
DateTime stopTime = DateTime.Now;
TimeSpan duration = stopTime - startTime;
Console.WriteLine("Duplicate List Test: " + duration.ToString() + " Input: " + loopSize + " " + multiplier + " List Size: " + words.Count);
}
}
}
Pingback: What’s Happening? » Blog Archive » Java Programming Help. - Egypt Forums