C # - 30 segundos
Uma abordagem diferente da maioria, se eu ler direito - não uso nenhuma estrutura baseada em hash.
Tendo a não obter resultados, não tenho certeza se isso é uma anomalia estatística ou erro no meu raciocínio. Corrigida, a comparação para classificação binária era falha.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
namespace FilterFile
{
class Program
{
const int COUNT = 50000000;
static string inputFile = "data" + COUNT + ".txt";
static string outputFile = "results.txt";
static void Main(string[] args)
{
Console.WriteLine("Prepping Test");
if (args.Length > 0) inputFile = args[0];
if (args.Length > 1) outputFile = args[1];
if (!File.Exists(inputFile))
{
Console.WriteLine(inputFile);
File.WriteAllLines(inputFile,
GenerateData(COUNT)
.Select(r => string.Format("{0} {1} {2}", r.A, r.B, r.C)));
}
File.Delete("results.txt");
Console.WriteLine("Starting Test \n\n");
using (Timer.Create("Total Time"))
{
Row[] sortedA, sortedB;
//http://codegolf.stackexchange.com/questions/26643/filter-a-large-file-quickly
using (Timer.Create("Reading Data"))
FillData(out sortedA, out sortedB);
using (Timer.Create("Parallel Sort A"))
ParallelSort.QuicksortParallel(sortedA);
using (Timer.Create("Parallel Sort B"))
ParallelSort.QuicksortParallel(sortedB, (x, y) => x.B - y.B);
object rLock = new object();
List<Row> results = new List<Row>();
var comparison = Comparer<Row>.Create((B, A) => B.B - A.A);
using (Timer.Create("Compute Results"))
Parallel.ForEach(sortedA, row =>
//foreach (var row in sortedA)
{
var i = Array.BinarySearch(sortedB, row, comparison);
if (i < 0) return;
Row other;
bool solved = false;
for (var tempI = i; tempI < sortedB.Length && row.A == (other = sortedB[tempI]).B; tempI++)
{
var diff = row.C - other.C;
if (diff >= 0 && diff < 100)
{
lock (rLock) results.Add(row);
return;
}
}
for (var tempI = i - 1; tempI >= 0 && row.A == (other = sortedB[tempI]).B; tempI--)
{
var diff = row.C - other.C;
if (diff >= 0 && diff < 100)
{
lock (rLock) results.Add(row);
return;
}
}
});
using (Timer.Create("Save Results"))
{
File.WriteAllLines(outputFile, results.Select(r => r.ToString()));
}
}
}
private static void FillData(out Row[] sortedA, out Row[] sortedB)
{
var tempA = new Row[COUNT];
var tempB = tempA;//new Row[COUNT];
const int PARTITION_SIZE = 1 << 22;
ReadAndSort(tempA, tempB, PARTITION_SIZE);
sortedA = tempA;
sortedB = new Row[COUNT];
Array.Copy(sortedA, sortedB, COUNT);
/*using (Timer.Create("MergeA"))
{
int destIndex = 0;
int[][] partitions = Enumerable.Range(0, COUNT / PARTITION_SIZE + 1)
.Select(i => new[] { i * PARTITION_SIZE, Math.Min(i * PARTITION_SIZE + PARTITION_SIZE, COUNT) - 1 })
.ToArray();
for (int i = 0; i < COUNT; i++)
{
foreach (var partition in partitions)
{
while (partition[0] <= partition[1] && tempA[partition[0]].A == i)
{
sortedA[destIndex++] = tempA[partition[0]++];
}
}
}
}*/
/*//Verify Paritioning Works
var results = new List<Tuple<Row, int>> { Tuple.Create(tempA[0], 0) };
for (int i = 1; i < tempA.Length; i++)
{
var r = tempA[i];
if (r.A < tempA[i-1].A)
results.Add(Tuple.Create(r, i % PARTITION_SIZE));
}
results.ForEach(t => Console.WriteLine(t.Item1 + " " + t.Item2));*/
}
private static void ReadAndSort(Row[] tempA, Row[] tempB, int PARTITION_SIZE)
{
List<Task> tasks = new List<Task>();
using (var stream = File.OpenRead(inputFile))
{
int b;
int tempMember = 0;
int memberIndex = 0;
int elementIndex = 0;
using (Timer.Create("Read From Disk"))
while ((b = stream.ReadByte()) >= 0)
{
switch (b)
{
case (byte)'\r':
case (byte)' ':
switch (memberIndex)
{
case 0: tempA[elementIndex].A = tempMember; memberIndex = 1; break;
case 1: tempA[elementIndex].B = tempMember; memberIndex = 2; break;
case 2: tempA[elementIndex].C = tempMember; memberIndex = 0; break;
}
tempMember = 0;
break;
case (byte)'\n':
/*if (elementIndex % PARTITION_SIZE == 0 && elementIndex > 0)
{
var copiedIndex = elementIndex;
tasks.Add(Task.Run(() =>
{
var startIndex = copiedIndex - PARTITION_SIZE;
Array.Copy(tempA, startIndex, tempB, startIndex, PARTITION_SIZE);
ParallelSort.QuicksortSequentialInPlace(tempA, startIndex, copiedIndex - 1);
ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, copiedIndex - 1, (x, y) => x.B - y.B);
}));
}*/
elementIndex++;
break;
default:
tempMember = tempMember * 10 + b - '0';
break;
}
}
/* tasks.Add(Task.Run(() =>
{
elementIndex--; //forget about the last \n
var startIndex = (elementIndex / PARTITION_SIZE) * PARTITION_SIZE;
Array.Copy(tempA, startIndex, tempB, startIndex, elementIndex - startIndex + 1);
ParallelSort.QuicksortParallelInPlace(tempA, startIndex, elementIndex);
ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, elementIndex, (x, y) => x.B - y.B);
}));
using (Timer.Create("WaitForSortingToFinish"))
Task.WaitAll(tasks.ToArray());*/
}
}
static Random rand = new Random();
public struct Row : IComparable<Row>
{
public int A;
public int B;
public int C;
public static Row RandomRow(int count)
{
return new Row { A = rand.Next(count), B = rand.Next(count), C = rand.Next(count) };
}
public int CompareTo(Row other)
{
return A - other.A;
}
public override string ToString()
{
return string.Format("{0} {1} {2}", A, B, C);
}
}
public static Row[] GenerateData(int count)
{
var data = new Row[count];
for (int i = 0; i < count; i++)
data[i] = Row.RandomRow(count);
return data;
}
public static Row[] GenerateSplitData(int count)
{
var data = new Row[count];
for (int i = 0; i < count; i++)
data[i] = Row.RandomRow(count);
return data;
}
public class Timer : IDisposable
{
string message;
Stopwatch sw;
public static Timer Create(string message)
{
Console.WriteLine("Started: " + message);
var t = new Timer();
t.message = message;
t.sw = Stopwatch.StartNew();
return t;
}
public void Dispose()
{
Console.WriteLine("Finished: " + message + " in " + sw.ElapsedMilliseconds + "ms");
}
}
// <summary>
/// Parallel quicksort algorithm.
/// </summary>
public class ParallelSort
{
const int SEQUENTIAL_THRESHOLD = 4096;
#region Public Static Methods
/// <summary>
/// Sequential quicksort.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortSequential<T>(T[] arr) where T : IComparable<T>
{
QuicksortSequentialInPlace(arr, 0, arr.Length - 1);
}
/// <summary>
/// Parallel quicksort
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortParallel<T>(T[] arr) where T : IComparable<T>
{
QuicksortParallelInPlace(arr, 0, arr.Length - 1);
}
#endregion
#region Private Static Methods
public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
if (right > left)
{
int pivot = Partition(arr, left, right);
QuicksortSequentialInPlace(arr, left, pivot - 1);
QuicksortSequentialInPlace(arr, pivot + 1, right);
}
}
public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
if (right > left)
{
if (right - left < SEQUENTIAL_THRESHOLD)
QuicksortSequentialInPlace(arr, left, right);
else
{
int pivot = Partition(arr, left, right);
Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1),
() => QuicksortParallelInPlace(arr, pivot + 1, right));
}
}
}
private static void Swap<T>(T[] arr, int i, int j)
{
T tmp = arr[i];
arr[i] = arr[j];
arr[j] = tmp;
}
private static int Partition<T>(T[] arr, int low, int high)
where T : IComparable<T>
{
// Simple partitioning implementation
int pivotPos = (high + low) / 2;
T pivot = arr[pivotPos];
Swap(arr, low, pivotPos);
int left = low;
for (int i = low + 1; i <= high; i++)
{
if (arr[i].CompareTo(pivot) < 0)
{
left++;
Swap(arr, i, left);
}
}
Swap(arr, low, left);
return left;
}
#endregion
#region Public Static Methods
/// <summary>
/// Sequential quicksort.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortSequential<T>(T[] arr, Func<T, T, int> comparer)
{
QuicksortSequentialInPlace(arr, 0, arr.Length - 1, comparer);
}
/// <summary>
/// Parallel quicksort
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortParallel<T>(T[] arr, Func<T, T, int> comparer)
{
QuicksortParallelInPlace(arr, 0, arr.Length - 1, comparer);
}
#endregion
#region Private Static Methods
public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
{
if (right > left)
{
int pivot = Partition(arr, left, right, comparer);
QuicksortSequentialInPlace(arr, left, pivot - 1, comparer);
QuicksortSequentialInPlace(arr, pivot + 1, right, comparer);
}
}
public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
{
if (right > left)
{
if (right - left < SEQUENTIAL_THRESHOLD)
{
QuicksortSequentialInPlace(arr, left, right, comparer);
}
else
{
int pivot = Partition(arr, left, right, comparer);
Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1, comparer),
() => QuicksortParallelInPlace(arr, pivot + 1, right, comparer));
}
}
}
private static int Partition<T>(T[] arr, int low, int high, Func<T, T, int> comparer)
{
// Simple partitioning implementation
int pivotPos = (high + low) / 2;
T pivot = arr[pivotPos];
Swap(arr, low, pivotPos);
int left = low;
for (int i = low + 1; i <= high; i++)
{
if (comparer(arr[i], pivot) < 0)
{
left++;
Swap(arr, i, left);
}
}
Swap(arr, low, left);
return left;
}
#endregion
}
}
}