Как ускорить обратное сканирование двоичного файла?

У меня есть спецификация двоичного файла, которая описывает структуру пакетированных данных. Каждый пакет данных имеет двухбайтовый шаблон синхронизации, поэтому возможно сканирование для начала пакета с использованием комбинаций BinaryReader и FileStream:

while(!reader.EndOfFile)
{
    // Check for sync pattern.
    if (reader.ReadUInt16() != 0xEB25)
    {
        // Move to next byte.
        reader.BaseStream.Seek(-1, SeekOrigin.Current);
        continue;
    }

    // If we got here, a sync pattern was found.
}

Этот процесс отлично работает в прямом направлении, но аналогичное сканирование кода в обратном направлении по меньшей мере на два порядка медленнее:

while(!reader.BeginningOfFile)
{
    // Check for sync pattern.
    if (reader.ReadUInt16() != 0xEB25)
    {
        // Move to previous byte.
        reader.BaseStream.Seek(-3, SeekOrigin.Current);
        continue;
    }

    // If we got here, a sync pattern was found.
}

Я пробовал несколько обходных решений, таких как возврат к произвольной сумме (в настоящее время 1 мегабайт) и сканирование вперёд, но становится ясно, что мне действительно нужен BinaryReader или FileStream, который изменен, чтобы адекватные эксплуатационные характеристики при чтении как в прямом, так и в обратном направлениях.

У меня уже есть FastFileStream, который улучшает производительность прямого чтения путем подклассификации обычного FileStream и кэширования свойств Position и Length (он также предоставляет свойства BeginningOfFile и EndOfFile). Это то, что управляет переменной reader в приведенном выше коде.

Есть ли что-то подобное, что я мог бы сделать для улучшения производительности обратного чтения, возможно, включив MemoryStream в качестве буфера?

Ответ 1

L.B, упомянутый в комментарии для использования файла Memory Mapped, может быть впечатлен производительностью.

Попробуйте что-то вроде этого:

var memoryMapName = Path.GetFileName(fileToRead);

using (var mapStream = new FileStream(fileToRead, FileMode.Open))
{
    using (var myMap = MemoryMappedFile.CreateFromFile(
                            mapStream, 
                            memoryMapName, mapStream.Length,
                            MemoryMappedFileAccess.Read, null, 
                            HandleInheritability.None, false))
    {                    
        long leftToRead = mapStream.Length;
        long mapSize = Math.Min(1024 * 1024, mapStream.Length);
        long bytesRead = 0;
        long mapOffset = Math.Max(mapStream.Length - mapSize, 0);

        while (leftToRead > 1)
        {
            using (var FileMap = myMap.CreateViewAccessor(mapOffset, 
                                 mapSize, MemoryMappedFileAccess.Read))
            {
                long readAt = mapSize - 2;
                while (readAt > -1)
                {
                    var int16Read = FileMap.ReadUInt16(readAt);
                    //0xEB25  <--check int16Read here                            
                    bytesRead += 1;
                    readAt -= 1;
                }
            }

            leftToRead = mapStream.Length- bytesRead;
            mapOffset = Math.Max(mapOffset - mapSize, 0);
            mapSize = Math.Min(mapSize, leftToRead);
        }
    }
}

Ответ 2

EDIT: Хорошо, у меня есть код. Ну, довольно много кода. Он позволяет сканировать вперед и назад для заголовков пакетов.

Я не гарантирую, что у него нет ошибок, и вы определенно хотите настроить размер буфера, чтобы увидеть, как он работает... но, учитывая тот же самый файл, который вы мне отправили, он по крайней мере показывает одинаковые позиции заголовков пакетов при сканировании вперед и назад:)

До этого кода я бы предположил, что, если возможно, сканирование через один раз и сохранение индекса информации о пакете для последующего использования, вероятно, будет лучшим подходом.

В любом случае, здесь код (полный без тестов, кроме программы-примера):

PacketHeader.cs:

using System;

namespace Chapter10Reader
{
    public sealed class PacketHeader
    {
        private readonly long filePosition;
        private readonly ushort channelId;
        private readonly uint packetLength;
        private readonly uint dataLength;
        private readonly byte dataTypeVersion;
        private readonly byte sequenceNumber;
        private readonly byte packetFlags;
        private readonly byte dataType;
        private readonly ulong relativeTimeCounter;

        public long FilePosition { get { return filePosition; } }
        public ushort ChannelId { get { return channelId; } }
        public uint PacketLength { get { return packetLength; } }
        public uint DataLength { get { return dataLength; } }
        public byte DataTypeVersion { get { return dataTypeVersion; } }
        public byte SequenceNumber { get { return sequenceNumber; } }
        public byte PacketFlags { get { return packetFlags; } }
        public byte DataType { get { return dataType; } }
        public ulong RelativeTimeCounter { get { return relativeTimeCounter; } }

        public PacketHeader(ushort channelId, uint packetLength, uint dataLength, byte dataTypeVersion,
            byte sequenceNumber, byte packetFlags, byte dataType, ulong relativeTimeCounter, long filePosition)
        {
            this.channelId = channelId;
            this.packetLength = packetLength;
            this.dataLength = dataLength;
            this.dataTypeVersion = dataTypeVersion;
            this.sequenceNumber = sequenceNumber;
            this.packetFlags = packetFlags;
            this.dataType = dataType;
            this.relativeTimeCounter = relativeTimeCounter;
            this.filePosition = filePosition;
        }

        internal static PacketHeader Parse(byte[] data, int index, long filePosition)
        {
            if (index + 24 > data.Length)
            {
                throw new ArgumentException("Packet header must be 24 bytes long; not enough data");
            }
            ushort syncPattern = BitConverter.ToUInt16(data, index + 0);
            if (syncPattern != 0xeb25)
            {
                throw new ArgumentException("Packet header must start with the sync pattern");
            }
            ushort channelId = BitConverter.ToUInt16(data, index + 2);
            uint packetLength = BitConverter.ToUInt32(data, index + 4);
            uint dataLength = BitConverter.ToUInt32(data, index + 8);
            byte dataTypeVersion = data[index + 12];
            byte sequenceNumber = data[index + 13];
            byte packetFlags = data[index + 14];
            byte dataType = data[index + 15];
            // TODO: Validate this...
            ulong relativeTimeCounter =
                (ulong)BitConverter.ToUInt32(data, index + 16) +
                ((ulong)BitConverter.ToUInt16(data, index + 20)) << 32;
            // Assume we've already validated the checksum...
            return new PacketHeader(channelId, packetLength, dataLength, dataTypeVersion, sequenceNumber,
                packetFlags, dataType, relativeTimeCounter, filePosition);
        }

        /// <summary>
        /// Checks a packet header checksum to see whether this *looks* like a packet header.
        /// </summary>
        internal static bool CheckPacketHeaderChecksum(byte[] data, int index)
        {
            if (index + 24 > data.Length)
            {
                throw new ArgumentException("Packet header must is 24 bytes long; not enough data");
            }
            ushort computed = 0;
            for (int i = 0; i < 11; i++)
            {
                computed += BitConverter.ToUInt16(data, index + i * 2);
            }
            return computed == BitConverter.ToUInt16(data, index + 22);
        }
    }
}

PacketScanner.cs:

using System;
using System.Diagnostics;
using System.IO;

namespace Chapter10Reader
{
    public sealed class PacketScanner : IDisposable
    {
        // 128K buffer... tweak this.
        private const int BufferSize = 1024 * 128;

        /// <summary>
        /// Where in the file does the buffer start?
        /// </summary>
        private long bufferStart;

        /// <summary>
        /// Where in the file does the buffer end (exclusive)?
        /// </summary>
        private long bufferEnd;

        /// <summary>
        /// Where are we in the file, logically?
        /// </summary>
        private long logicalPosition;

        // Probably cached by FileStream, but we use it a lot, so let's
        // not risk it...
        private readonly long fileLength;

        private readonly FileStream stream;
        private readonly byte[] buffer = new byte[BufferSize];        

        private PacketScanner(FileStream stream)
        {
            this.stream = stream;
            this.fileLength = stream.Length;
        }

        public void MoveToEnd()
        {
            logicalPosition = fileLength;
            bufferStart = -1; // Invalidate buffer
            bufferEnd = -1;
        }

        public void MoveToBeforeStart()
        {
            logicalPosition = -1;
            bufferStart = -1;
            bufferEnd = -1;
        }

        private byte this[long position]
        {
            get 
            {
                if (position < bufferStart || position >= bufferEnd)
                {
                    FillBuffer(position);
                }
                return buffer[position - bufferStart];
            }
        }

        /// <summary>
        /// Fill the buffer to include the given position.
        /// If the position is earlier than the buffer, assume we're reading backwards
        /// and make position one before the end of the buffer.
        /// If the position is later than the buffer, assume we're reading forwards
        /// and make position the start of the buffer.
        /// If the buffer is invalid, make position the start of the buffer.
        /// </summary>
        private void FillBuffer(long position)
        {
            long newStart;
            if (position > bufferStart)
            {
                newStart = position;
            }
            else
            {
                // Keep position *and position + 1* to avoid swapping back and forth too much
                newStart = Math.Max(0, position - buffer.Length + 2);
            }
            // Make position the start of the buffer.
            int bytesRead;
            int index = 0;
            stream.Position = newStart;
            while ((bytesRead = stream.Read(buffer, index, buffer.Length - index)) > 0)
            {
                index += bytesRead;
            }
            bufferStart = newStart;
            bufferEnd = bufferStart + index;
        }

        /// <summary>
        /// Make sure the buffer contains the given positions.
        /// 
        /// </summary>
        private void FillBuffer(long start, long end)
        {
            if (end - start > buffer.Length)
            {
                throw new ArgumentException("Buffer not big enough!");
            }
            if (end > fileLength)
            {
                throw new ArgumentException("Beyond end of file");
            }
            // Nothing to do.
            if (start >= bufferStart && end < bufferEnd)
            {
                return;
            }
            // TODO: Optimize this more to use whatever bits we've actually got.
            // (We're optimized for "we've got the start, get the end" but not the other way round.)
            if (start >= bufferStart)
            {
                // We've got the start, but not the end. Just shift things enough and read the end...
                int shiftAmount = (int) (end - bufferEnd);
                Buffer.BlockCopy(buffer, shiftAmount, buffer, 0, (int) (bufferEnd - bufferStart - shiftAmount));
                stream.Position = bufferEnd;
                int bytesRead;
                int index = (int)(bufferEnd - bufferStart - shiftAmount);
                while ((bytesRead = stream.Read(buffer, index, buffer.Length - index)) > 0)
                {
                    index += bytesRead;
                }
                bufferStart += shiftAmount;
                bufferEnd = bufferStart + index;
                return;
            }

            // Just fill the buffer starting from start...
            bufferStart = -1;
            bufferEnd = -1;
            FillBuffer(start);
        }

        /// <summary>
        /// Returns the header of the next packet, or null 
        /// if we've reached the end of the file.
        /// </summary>
        public PacketHeader NextHeader()
        {
            for (long tryPosition = logicalPosition + 1; tryPosition < fileLength - 23; tryPosition++)
            {
                if (this[tryPosition] == 0x25 && this[tryPosition + 1] == 0xEB)
                {
                    FillBuffer(tryPosition, tryPosition + 24);
                    int bufferPosition = (int) (tryPosition - bufferStart);
                    if (PacketHeader.CheckPacketHeaderChecksum(buffer, bufferPosition))
                    {
                        logicalPosition = tryPosition;
                        return PacketHeader.Parse(buffer, bufferPosition, tryPosition);
                    }
                }
            }
            logicalPosition = fileLength;
            return null;
        }

        /// <summary>
        /// Returns the header of the previous packet, or null 
        /// if we've reached the start of the file.
        /// </summary>
        public PacketHeader PreviousHeader()
        {
            for (long tryPosition = logicalPosition - 1; tryPosition >= 0; tryPosition--)
            {
                if (this[tryPosition + 1] == 0xEB && this[tryPosition] == 0x25)
                {
                    FillBuffer(tryPosition, tryPosition + 24);
                    int bufferPosition = (int)(tryPosition - bufferStart);
                    if (PacketHeader.CheckPacketHeaderChecksum(buffer, bufferPosition))
                    {
                        logicalPosition = tryPosition;
                        return PacketHeader.Parse(buffer, bufferPosition, tryPosition);
                    }
                }
            }
            logicalPosition = -1;
            return null;
        }

        public static PacketScanner OpenFile(string filename)
        {
            return new PacketScanner(File.OpenRead(filename));
        }

        public void Dispose()
        {
            stream.Dispose();
        }
    }
}

Program.cs(для тестирования):

using System;
using System.Collections.Generic;
using System.Linq;

namespace Chapter10Reader
{
    class Program
    {
        static void Main(string[] args)
        {
            string filename = "test.ch10";

            Console.WriteLine("Forwards:");
            List<long> positionsForward = new List<long>();
            using (PacketScanner scanner = PacketScanner.OpenFile(filename))
            {
                scanner.MoveToBeforeStart();
                PacketHeader header;
                while ((header = scanner.NextHeader()) != null)
                {
                    Console.WriteLine("Found header at {0}", header.FilePosition);
                    positionsForward.Add(header.FilePosition);
                }
            }
            Console.WriteLine();
            Console.WriteLine("Backwards:");
            List<long> positionsBackward = new List<long>();
            using (PacketScanner scanner = PacketScanner.OpenFile(filename))
            {
                scanner.MoveToEnd();
                PacketHeader header;
                while ((header = scanner.PreviousHeader()) != null)
                {
                    positionsBackward.Add(header.FilePosition);
                }
            }
            positionsBackward.Reverse();
            foreach (var position in positionsBackward)
            {
                Console.WriteLine("Found header at {0}", position);
            }

            Console.WriteLine("Same? {0}", positionsForward.SequenceEqual(positionsBackward));
        }
    }
}