Преобразовать строку (UTF-16) в UTF-8 в С#

Мне нужно преобразовать строку в UTF-8 в С#. Я уже много раз пробовал, но никто не работает, как мне хотелось. Я преобразовал свою строку в массив байтов, а затем попытался записать ее в файл XML (какая кодировка - UTF-8....), но либо я получил ту же строку (не закодированную вообще), либо у меня есть список байт, который бесполезен.... Кто-то сталкивается с той же проблемой?

Изменить: Это часть кода, который я использовал:

str= "testé";
byte[] utf8Bytes = Encoding.UTF8.GetBytes(str);
return Encoding.UTF8.GetString(utf8Bytes);

В результате "testé" или я ожидал чего-то вроде "testÃ ©"...

Ответ 1

Если вы хотите строку UTF8, где каждый байт правильный ('Ö' → [195, 0], [150, 0]), вы можете использовать следующее:

public static string Utf16ToUtf8(string utf16String)
{
   /**************************************************************
    * Every .NET string will store text with the UTF16 encoding, *
    * known as Encoding.Unicode. Other encodings may exist as    *
    * Byte-Array or incorrectly stored with the UTF16 encoding.  *
    *                                                            *
    * UTF8 = 1 bytes per char                                    *
    *    ["100" for the ansi 'd']                                *
    *    ["206" and "186" for the russian 'κ']                   *
    *                                                            *
    * UTF16 = 2 bytes per char                                   *
    *    ["100, 0" for the ansi 'd']                             *
    *    ["186, 3" for the russian 'κ']                          *
    *                                                            *
    * UTF8 inside UTF16                                          *
    *    ["100, 0" for the ansi 'd']                             *
    *    ["206, 0" and "186, 0" for the russian 'κ']             *
    *                                                            *
    * We can use the convert encoding function to convert an     *
    * UTF16 Byte-Array to an UTF8 Byte-Array. When we use UTF8   *
    * encoding to string method now, we will get a UTF16 string. *
    *                                                            *
    * So we imitate UTF16 by filling the second byte of a char   *
    * with a 0 byte (binary 0) while creating the string.        *
    **************************************************************/

    // Storage for the UTF8 string
    string utf8String = String.Empty;

    // Get UTF16 bytes and convert UTF16 bytes to UTF8 bytes
    byte[] utf16Bytes = Encoding.Unicode.GetBytes(utf16String);
    byte[] utf8Bytes = Encoding.Convert(Encoding.Unicode, Encoding.UTF8, utf16Bytes);

    // Fill UTF8 bytes inside UTF8 string
    for (int i = 0; i < utf8Bytes.Length; i++)
    {
        // Because char always saves 2 bytes, fill char with 0
        byte[] utf8Container = new byte[2] { utf8Bytes[i], 0 };
        utf8String += BitConverter.ToChar(utf8Container, 0);
    }

    // Return UTF8
    return utf8String;
}

В моем случае DLL-запрос также является строкой UTF8, но, к сожалению, строка UTF8 должна интерпретироваться с кодировкой UTF16 ('Ö' → [195, 0], [19, 32]). Таким образом, ANSI '-', который равен 150, должен быть преобразован в UTF16 '-', который равен 8211. Если у вас есть этот случай, вы можете вместо этого использовать следующее:

public static string Utf16ToUtf8(string utf16String)
{
    // Get UTF16 bytes and convert UTF16 bytes to UTF8 bytes
    byte[] utf16Bytes = Encoding.Unicode.GetBytes(utf16String);
    byte[] utf8Bytes = Encoding.Convert(Encoding.Unicode, Encoding.UTF8, utf16Bytes);

    // Return UTF8 bytes as ANSI string
    return Encoding.Default.GetString(utf8Bytes);
}

Или собственный метод:

[DllImport("kernel32.dll")]
private static extern Int32 WideCharToMultiByte(UInt32 CodePage, UInt32 dwFlags, [MarshalAs(UnmanagedType.LPWStr)] String lpWideCharStr, Int32 cchWideChar, [Out, MarshalAs(UnmanagedType.LPStr)] StringBuilder lpMultiByteStr, Int32 cbMultiByte, IntPtr lpDefaultChar, IntPtr lpUsedDefaultChar);

public static string Utf16ToUtf8(string utf16String)
{
    Int32 iNewDataLen = WideCharToMultiByte(Convert.ToUInt32(Encoding.UTF8.CodePage), 0, utf16String, utf16String.Length, null, 0, IntPtr.Zero, IntPtr.Zero);
    if (iNewDataLen > 1)
    {
        StringBuilder utf8String = new StringBuilder(iNewDataLen);
        WideCharToMultiByte(Convert.ToUInt32(Encoding.UTF8.CodePage), 0, utf16String, -1, utf8String, utf8String.Capacity, IntPtr.Zero, IntPtr.Zero);

        return utf8String.ToString();
    }
    else
    {
        return String.Empty;
    }
}

Если вам это нужно наоборот, см. Utf8ToUtf16. Надеюсь, я справлюсь.

Ответ 2

Строка в С# всегда является UTF-16, нет возможности ее "конвертировать". Кодирование не имеет значения до тех пор, пока вы манипулируете строкой в памяти, это имеет значение только при записи строки в поток (файл, поток памяти, сетевой поток...).

Если вы хотите записать строку в файл XML, просто укажите кодировку при создании XmlWriter

Ответ 3

    private static string Utf16ToUtf8(string utf16String)
    {
        /**************************************************************
         * Every .NET string will store text with the UTF16 encoding, *
         * known as Encoding.Unicode. Other encodings may exist as    *
         * Byte-Array or incorrectly stored with the UTF16 encoding.  *
         *                                                            *
         * UTF8 = 1 bytes per char                                    *
         *    ["100" for the ansi 'd']                                *
         *    ["206" and "186" for the russian '?']                   *
         *                                                            *
         * UTF16 = 2 bytes per char                                   *
         *    ["100, 0" for the ansi 'd']                             *
         *    ["186, 3" for the russian '?']                          *
         *                                                            *
         * UTF8 inside UTF16                                          *
         *    ["100, 0" for the ansi 'd']                             *
         *    ["206, 0" and "186, 0" for the russian '?']             *
         *                                                            *
         * We can use the convert encoding function to convert an     *
         * UTF16 Byte-Array to an UTF8 Byte-Array. When we use UTF8   *
         * encoding to string method now, we will get a UTF16 string. *
         *                                                            *
         * So we imitate UTF16 by filling the second byte of a char   *
         * with a 0 byte (binary 0) while creating the string.        *
         **************************************************************/

        // Get UTF16 bytes and convert UTF16 bytes to UTF8 bytes
        byte[] utf16Bytes = Encoding.Unicode.GetBytes(utf16String);
        byte[] utf8Bytes = Encoding.Convert(Encoding.Unicode, Encoding.UTF8, utf16Bytes);
        char[] chars = (char[])Array.CreateInstance(typeof(char), utf8Bytes.Length);

        for (int i = 0; i < utf8Bytes.Length; i++)
        {
            chars[i] = BitConverter.ToChar(new byte[2] { utf8Bytes[i], 0 }, 0);
        }

        // Return UTF8
        return new String(chars);
    }

В исходной статье автор конкатенированных строк. Каждая операция скрининга приведет к восстановлению последовательности в .Net. Строка является ссылочным типом. В результате предусмотренная функция будет заметно медленной. Не делай этого. Вместо этого используйте массив символов, напишите там, а затем преобразуйте результат в строку. В моем случае обработка 500 kb текстовой разницы составляет почти 5 минут.

Ответ 4

Отметьте ответ Jon Skeet на этот другой вопрос: Преобразование UTF-16 в UTF-8 (для сценариев в Windows)

Он содержит исходный код, который вам нужен.

Надеюсь, что это поможет.

Ответ 5

Помогает ли этот пример?

using System;
using System.IO;
using System.Text;

class Test
{
   public static void Main() 
   {        
    using (StreamWriter output = new StreamWriter("practice.txt")) 
    {
        // Create and write a string containing the symbol for Pi.
        string srcString = "Area = \u03A0r^2";

        // Convert the UTF-16 encoded source string to UTF-8 and ASCII.
        byte[] utf8String = Encoding.UTF8.GetBytes(srcString);
        byte[] asciiString = Encoding.ASCII.GetBytes(srcString);

        // Write the UTF-8 and ASCII encoded byte arrays. 
        output.WriteLine("UTF-8  Bytes: {0}", BitConverter.ToString(utf8String));
        output.WriteLine("ASCII  Bytes: {0}", BitConverter.ToString(asciiString));


        // Convert UTF-8 and ASCII encoded bytes back to UTF-16 encoded  
        // string and write.
        output.WriteLine("UTF-8  Text : {0}", Encoding.UTF8.GetString(utf8String));
        output.WriteLine("ASCII  Text : {0}", Encoding.ASCII.GetString(asciiString));

        Console.WriteLine(Encoding.UTF8.GetString(utf8String));
        Console.WriteLine(Encoding.ASCII.GetString(asciiString));
    }
}

}

Ответ 6

class Program
{
    static void Main(string[] args)
    {
        String unicodeString =
        "This Unicode string contains two characters " +
        "with codes outside the traditional ASCII code range, " +
        "Pi (\u03a0) and Sigma (\u03a3).";

        Console.WriteLine("Original string:");
        Console.WriteLine(unicodeString);
        UnicodeEncoding unicodeEncoding = new UnicodeEncoding();
        byte[] utf16Bytes = unicodeEncoding.GetBytes(unicodeString);
        char[] chars = unicodeEncoding.GetChars(utf16Bytes, 2, utf16Bytes.Length - 2);
        string s = new string(chars);
        Console.WriteLine();
        Console.WriteLine("Char Array:");
        foreach (char c in chars) Console.Write(c);
        Console.WriteLine();
        Console.WriteLine();
        Console.WriteLine("String from Char Array:");
        Console.WriteLine(s);

        Console.ReadKey();
    }
}