diff --git a/.gitattributes b/.gitattributes index 45ea6c25e82..8d2fb108ba0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,6 +7,8 @@ thirdparty/* linguist-vendored * text=auto eol=lf # Except for bat files, which are Windows only files *.bat eol=crlf +# And some test files where the EOL matters +*.test.txt -text # The above only works properly for Git 2.10+, so for older versions # we need to manually list the binary files we don't want modified. diff --git a/core/core_bind.cpp b/core/core_bind.cpp index 3f94ff8329a..9382e174f1e 100644 --- a/core/core_bind.cpp +++ b/core/core_bind.cpp @@ -1227,13 +1227,13 @@ Vector File::get_buffer(int64_t p_length) const { return data; } -String File::get_as_text() const { +String File::get_as_text(bool p_skip_cr) const { ERR_FAIL_COND_V_MSG(f.is_null(), String(), "File must be opened before use, or is lacking read-write permission."); uint64_t original_pos = f->get_position(); const_cast(*f)->seek(0); - String text = f->get_as_utf8_string(); + String text = f->get_as_utf8_string(p_skip_cr); const_cast(*f)->seek(original_pos); @@ -1430,7 +1430,7 @@ void File::_bind_methods() { ClassDB::bind_method(D_METHOD("get_buffer", "length"), &File::get_buffer); ClassDB::bind_method(D_METHOD("get_line"), &File::get_line); ClassDB::bind_method(D_METHOD("get_csv_line", "delim"), &File::get_csv_line, DEFVAL(",")); - ClassDB::bind_method(D_METHOD("get_as_text"), &File::get_as_text); + ClassDB::bind_method(D_METHOD("get_as_text", "skip_cr"), &File::get_as_text, DEFVAL(false)); ClassDB::bind_method(D_METHOD("get_md5", "path"), &File::get_md5); ClassDB::bind_method(D_METHOD("get_sha256", "path"), &File::get_sha256); ClassDB::bind_method(D_METHOD("is_big_endian"), &File::is_big_endian); diff --git a/core/core_bind.h b/core/core_bind.h index 3a4faa34223..7564bdc7e3a 100644 --- a/core/core_bind.h +++ b/core/core_bind.h @@ -411,7 +411,7 @@ public: Vector get_buffer(int64_t p_length) const; // Get an array of bytes. String get_line() const; Vector get_csv_line(const String &p_delim = ",") const; - String get_as_text() const; + String get_as_text(bool p_skip_cr = false) const; String get_md5(const String &p_path) const; String get_sha256(const String &p_path) const; diff --git a/core/io/file_access.cpp b/core/io/file_access.cpp index da25f23917f..8ed3d40c22d 100644 --- a/core/io/file_access.cpp +++ b/core/io/file_access.cpp @@ -377,7 +377,7 @@ uint64_t FileAccess::get_buffer(uint8_t *p_dst, uint64_t p_length) const { return i; } -String FileAccess::get_as_utf8_string() const { +String FileAccess::get_as_utf8_string(bool p_skip_cr) const { Vector sourcef; uint64_t len = get_length(); sourcef.resize(len + 1); @@ -388,7 +388,7 @@ String FileAccess::get_as_utf8_string() const { w[len] = 0; String s; - s.parse_utf8((const char *)w); + s.parse_utf8((const char *)w, -1, p_skip_cr); return s; } diff --git a/core/io/file_access.h b/core/io/file_access.h index e2c11142d7d..33868006868 100644 --- a/core/io/file_access.h +++ b/core/io/file_access.h @@ -113,7 +113,7 @@ public: virtual String get_line() const; virtual String get_token() const; virtual Vector get_csv_line(const String &p_delim = ",") const; - virtual String get_as_utf8_string() const; + virtual String get_as_utf8_string(bool p_skip_cr = false) const; /** * Use this for files WRITTEN in _big_ endian machines (ie, amiga/mac) diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp index beefe54fafd..e93375bff77 100644 --- a/core/string/ustring.cpp +++ b/core/string/ustring.cpp @@ -1656,7 +1656,7 @@ String String::utf8(const char *p_utf8, int p_len) { return ret; } -Error String::parse_utf8(const char *p_utf8, int p_len) { +Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) { if (!p_utf8) { return ERR_INVALID_DATA; } @@ -1689,6 +1689,10 @@ Error String::parse_utf8(const char *p_utf8, int p_len) { uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp); if (skip == 0) { + if (p_skip_cr && c == '\r') { + ptrtmp++; + continue; + } /* Determine the number of characters in sequence */ if ((c & 0x80) == 0) { skip = 0; @@ -1753,6 +1757,10 @@ Error String::parse_utf8(const char *p_utf8, int p_len) { uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8); if (skip == 0) { + if (p_skip_cr && c == '\r') { + p_utf8++; + continue; + } /* Determine the number of characters in sequence */ if ((c & 0x80) == 0) { *(dst++) = c; diff --git a/core/string/ustring.h b/core/string/ustring.h index 76726639647..6c3169f1368 100644 --- a/core/string/ustring.h +++ b/core/string/ustring.h @@ -377,7 +377,7 @@ public: CharString ascii(bool p_allow_extended = false) const; CharString utf8() const; - Error parse_utf8(const char *p_utf8, int p_len = -1); + Error parse_utf8(const char *p_utf8, int p_len = -1, bool p_skip_cr = false); static String utf8(const char *p_utf8, int p_len = -1); Char16String utf16() const; diff --git a/doc/classes/File.xml b/doc/classes/File.xml index 0b4a8fa46ee..3a2776ff210 100644 --- a/doc/classes/File.xml +++ b/doc/classes/File.xml @@ -115,9 +115,10 @@ + - Returns the whole file as a [String]. - Text is interpreted as being UTF-8 encoded. + Returns the whole file as a [String]. Text is interpreted as being UTF-8 encoded. + If [code]skip_cr[/code] is [code]true[/code], carriage return characters ([code]\r[/code], CR) will be ignored when parsing the UTF-8, so that only line feed characters ([code]\n[/code], LF) represent a new line (Unix convention). diff --git a/misc/scripts/file_format.sh b/misc/scripts/file_format.sh index c767d3f8a04..731b3ee0051 100755 --- a/misc/scripts/file_format.sh +++ b/misc/scripts/file_format.sh @@ -37,6 +37,8 @@ while IFS= read -rd '' f; do continue elif [[ "$f" == *"-so_wrap."* ]]; then continue + elif [[ "$f" == *".test.txt" ]]; then + continue fi # Ensure that files are UTF-8 formatted. recode UTF-8 "$f" 2> /dev/null diff --git a/platform/android/file_access_filesystem_jandroid.cpp b/platform/android/file_access_filesystem_jandroid.cpp index 733d92f741d..6b21c18d593 100644 --- a/platform/android/file_access_filesystem_jandroid.cpp +++ b/platform/android/file_access_filesystem_jandroid.cpp @@ -29,9 +29,11 @@ /*************************************************************************/ #include "file_access_filesystem_jandroid.h" + #include "core/os/os.h" #include "core/templates/local_vector.h" #include "thread_jandroid.h" + #include jobject FileAccessFilesystemJAndroid::file_access_handler = nullptr; @@ -198,7 +200,7 @@ String FileAccessFilesystemJAndroid::get_line() const { if (elem == '\n' || elem == '\0') { // Found the end of the line const_cast(this)->seek(start_position + line_buffer_position + 1); - if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position)) { + if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position, true)) { return String(); } return result; @@ -206,7 +208,7 @@ String FileAccessFilesystemJAndroid::get_line() const { } } - if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position)) { + if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position, true)) { return String(); } return result; diff --git a/tests/core/io/test_file_access.h b/tests/core/io/test_file_access.h index f0e1cceacf4..aab62955cbb 100644 --- a/tests/core/io/test_file_access.h +++ b/tests/core/io/test_file_access.h @@ -78,6 +78,29 @@ TEST_CASE("[FileAccess] CSV read") { CHECK(row5[1] == "tab separated"); CHECK(row5[2] == "lines, good?"); } + +TEST_CASE("[FileAccess] Get as UTF-8 String") { + Ref f_lf = FileAccess::open(TestUtils::get_data_path("line_endings_lf.test.txt"), FileAccess::READ); + String s_lf = f_lf->get_as_utf8_string(); + f_lf->seek(0); + String s_lf_nocr = f_lf->get_as_utf8_string(true); + CHECK(s_lf == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n"); + CHECK(s_lf_nocr == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n"); + + Ref f_crlf = FileAccess::open(TestUtils::get_data_path("line_endings_crlf.test.txt"), FileAccess::READ); + String s_crlf = f_crlf->get_as_utf8_string(); + f_crlf->seek(0); + String s_crlf_nocr = f_crlf->get_as_utf8_string(true); + CHECK(s_crlf == "Hello darkness\r\nMy old friend\r\nI've come to talk\r\nWith you again\r\n"); + CHECK(s_crlf_nocr == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n"); + + Ref f_cr = FileAccess::open(TestUtils::get_data_path("line_endings_cr.test.txt"), FileAccess::READ); + String s_cr = f_cr->get_as_utf8_string(); + f_cr->seek(0); + String s_cr_nocr = f_cr->get_as_utf8_string(true); + CHECK(s_cr == "Hello darkness\rMy old friend\rI've come to talk\rWith you again\r"); + CHECK(s_cr_nocr == "Hello darknessMy old friendI've come to talkWith you again"); +} } // namespace TestFileAccess #endif // TEST_FILE_ACCESS_H diff --git a/tests/core/string/test_string.h b/tests/core/string/test_string.h index 0c5704d6c97..b8b766023a6 100644 --- a/tests/core/string/test_string.h +++ b/tests/core/string/test_string.h @@ -152,6 +152,20 @@ TEST_CASE("[String] UTF16 with BOM") { CHECK(String::utf16(cs) == s); } +TEST_CASE("[String] UTF8 with CR") { + const String base = U"Hello darkness\r\nMy old friend\nI've come to talk\rWith you again"; + + String keep_cr; + Error err = keep_cr.parse_utf8(base.utf8().get_data()); + CHECK(err == OK); + CHECK(keep_cr == base); + + String no_cr; + err = no_cr.parse_utf8(base.utf8().get_data(), -1, true); // Skip CR. + CHECK(err == OK); + CHECK(no_cr == base.replace("\r", "")); +} + TEST_CASE("[String] Invalid UTF8 (non-standard)") { ERR_PRINT_OFF static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 }; diff --git a/tests/data/line_endings_cr.test.txt b/tests/data/line_endings_cr.test.txt new file mode 100644 index 00000000000..556154bb250 --- /dev/null +++ b/tests/data/line_endings_cr.test.txt @@ -0,0 +1 @@ +Hello darkness My old friend I've come to talk With you again \ No newline at end of file diff --git a/tests/data/line_endings_crlf.test.txt b/tests/data/line_endings_crlf.test.txt new file mode 100644 index 00000000000..a3cbe55b7f8 --- /dev/null +++ b/tests/data/line_endings_crlf.test.txt @@ -0,0 +1,4 @@ +Hello darkness +My old friend +I've come to talk +With you again diff --git a/tests/data/line_endings_lf.test.txt b/tests/data/line_endings_lf.test.txt new file mode 100644 index 00000000000..0aabcd911e5 --- /dev/null +++ b/tests/data/line_endings_lf.test.txt @@ -0,0 +1,4 @@ +Hello darkness +My old friend +I've come to talk +With you again