From faf78b70fb43a041b8812a4f47571fd0b915c604 Mon Sep 17 00:00:00 2001 From: Jahnvi Thakkar Date: Mon, 27 Apr 2026 16:03:17 +0530 Subject: [PATCH 1/2] fix: fetch VARCHAR UTF-8 collation columns as SQL_C_WCHAR on Windows to prevent lossy ACP conversion On Windows, the ODBC Driver Manager converts SQL_C_CHAR data to the system's ANSI code page (typically CP1252) before delivering it to the application. This is lossy for characters outside CP1252: CJK/Emoji get replaced with '?' (irreversible data loss) and extended Latin characters arrive as CP1252 bytes that fail UTF-8 decoding (returned as raw bytes instead of str). Fix: When charEncoding is 'utf-8' on Windows, fetch VARCHAR/CHAR/LONGVARCHAR columns as SQL_C_WCHAR (UTF-16LE) instead of SQL_C_CHAR. The ODBC driver converts losslessly to UTF-16LE, bypassing the lossy ACP conversion entirely. Changes in ddbc_bindings.cpp: - Add ShouldFetchCharAsWChar() helper (Windows-only, UTF-8 only) - SQLGetData_wrap (fetchone path): fetch VARCHAR as SQL_C_WCHAR when active, decode via PyUnicode_FromWideChar, with LOB streaming fallback - SQLBindColums (batch path): accept charEncoding param, bind VARCHAR into wcharBuffers as SQL_C_WCHAR when active - FetchBatchData: route VARCHAR columns to ProcessWChar dispatcher when active, compute correct fetchBufferSize for WCHAR buffers - FetchMany_wrap/FetchAll_wrap: pass charEncoding to SQLBindColums - Arrow path: unchanged (uses default empty charEncoding, no WCHAR workaround) Not affected: - Linux/macOS (ShouldFetchCharAsWChar always returns false) - Non-UTF-8 encodings (cp1252, latin-1, gbk, etc. use old SQL_C_CHAR path) - NVARCHAR columns (already use SQL_C_WCHAR) - setencoding API (write path, unrelated to fetch) Test: add test_varchar_utf8_collation_unicode_roundtrip covering ASCII, German, Chinese, Japanese, Russian, Greek, Arabic, Emoji, French through fetchone, fetchall, and fetchmany paths. 
--- mssql_python/pybind/ddbc_bindings.cpp | 208 +++++++++++++++++++++----- tests/test_013_encoding_decoding.py | 117 +++++++++++++++ 2 files changed, 286 insertions(+), 39 deletions(-) diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index 47a2a2554..6cc645acf 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -68,6 +68,26 @@ inline std::string GetEffectiveCharDecoding(const std::string& userEncoding) { #endif } +// Returns true if VARCHAR columns should be fetched as SQL_C_WCHAR (UTF-16LE) +// instead of SQL_C_CHAR to avoid the lossy ACP conversion on Windows. +// +// On Windows, the ODBC driver converts SQL_C_CHAR data from the server's encoding +// to the system's ANSI code page (e.g., CP1252). This is lossy for characters +// outside the ACP range. When the user requests UTF-8 decoding for SQL_CHAR, +// we fetch as SQL_C_WCHAR (UTF-16LE) which the ODBC driver converts losslessly, +// then decode from UTF-16LE to Python str. +// +// On Linux/macOS, the ODBC driver already returns UTF-8 for SQL_C_CHAR based +// on the system locale, so this workaround is not needed. 
+inline bool ShouldFetchCharAsWChar(const std::string& charEncoding) { +#if defined(_WIN32) + return charEncoding == "utf-8" || charEncoding == "UTF-8" || charEncoding == "utf8"; +#else + (void)charEncoding; + return false; +#endif +} + namespace PythonObjectCache { py::object get_time_class(); } @@ -3210,11 +3230,88 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p case SQL_LONGVARCHAR: { if (columnSize == SQL_NO_TOTAL || columnSize == 0 || columnSize > SQL_MAX_LOB_SIZE) { - LOG("SQLGetData: Streaming LOB for column %d (SQL_C_CHAR) " - "- columnSize=%lu", - i, (unsigned long)columnSize); - row.append( - FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding)); + // LOB path: stream the data + if (ShouldFetchCharAsWChar(charEncoding)) { + // On Windows with UTF-8, fetch LOB VARCHAR as WCHAR to avoid + // lossy ACP conversion + LOG("SQLGetData: Streaming LOB for column %d (SQL_C_WCHAR via " + "UTF-8 workaround) - columnSize=%lu", + i, (unsigned long)columnSize); + row.append( + FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, charEncoding)); + } else { + LOG("SQLGetData: Streaming LOB for column %d (SQL_C_CHAR) " + "- columnSize=%lu", + i, (unsigned long)columnSize); + row.append( + FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding)); + } + } else if (ShouldFetchCharAsWChar(charEncoding)) { + // On Windows with UTF-8 decoding: fetch VARCHAR as SQL_C_WCHAR + // to bypass the ODBC driver's lossy ACP (e.g. CP1252) conversion. + // The ODBC driver converts losslessly to UTF-16LE for SQL_C_WCHAR. 
+ uint64_t wcharBufSize = (columnSize + 1); // in SQLWCHAR units + std::vector<SQLWCHAR> wdataBuffer(wcharBufSize); + SQLLEN dataLen; + ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, wdataBuffer.data(), + wcharBufSize * sizeof(SQLWCHAR), &dataLen); + if (SQL_SUCCEEDED(ret)) { + if (dataLen > 0) { + uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR); + if (numCharsInData <= columnSize) { +#if defined(_WIN32) + PyObject* pyStr = PyUnicode_FromWideChar( + reinterpret_cast<const wchar_t*>(wdataBuffer.data()), + numCharsInData); +#else + PyObject* pyStr = PyUnicode_DecodeUTF16( + reinterpret_cast<const char*>(wdataBuffer.data()), + numCharsInData * sizeof(SQLWCHAR), NULL, NULL); +#endif + if (pyStr) { + row.append(py::reinterpret_steal<py::object>(pyStr)); + LOG("SQLGetData: CHAR column %d fetched as WCHAR (UTF-8 " + "workaround), %zu bytes -> decoded", + i, (size_t)dataLen); + } else { + PyErr_Clear(); + LOG_ERROR("SQLGetData: Failed to decode WCHAR data for " + "CHAR column %d", + i); + row.append(py::none()); + } + } else { + // Buffer too small, fallback to LOB streaming + LOG("SQLGetData: CHAR column %d WCHAR data truncated, " + "using streaming LOB", + i); + row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, + charEncoding)); + } + } else if (dataLen == SQL_NULL_DATA) { + LOG("SQLGetData: Column %d is NULL (CHAR via WCHAR)", i); + row.append(py::none()); + } else if (dataLen == 0) { + row.append(py::str("")); + } else if (dataLen == SQL_NO_TOTAL) { + LOG("SQLGetData: SQL_NO_TOTAL for column %d (CHAR via WCHAR), " + "falling back to LOB", + i); + row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, + charEncoding)); + } else if (dataLen < 0) { + LOG("SQLGetData: Unexpected negative data length " + "for column %d (CHAR via WCHAR) - dataLen=%ld", + i, (long)dataLen); + ThrowStdException("SQLGetData returned an unexpected negative " + "data length"); + } + } else { + LOG("SQLGetData: Error retrieving WCHAR data for CHAR column %d " + "- SQLRETURN=%d, returning NULL", + i, ret); + 
row.append(py::none()); + } } else { // Allocate columnSize * 4 + 1 on ALL platforms (no #if guard). // @@ -3731,9 +3828,13 @@ SQLRETURN SQLFetchScroll_wrap(SqlHandlePtr StatementHandle, SQLSMALLINT FetchOri // For column in the result set, binds a buffer to retrieve column data // TODO: Move to anonymous namespace, since it is not used outside this file +// charEncoding default is "" so callers that don't pass it (e.g. Arrow path) +// will NOT trigger the WCHAR workaround for VARCHAR columns. SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames, - SQLUSMALLINT numCols, int fetchSize) { + SQLUSMALLINT numCols, int fetchSize, + const std::string& charEncoding = "") { SQLRETURN ret = SQL_SUCCESS; + const bool fetchCharAsWChar = ShouldFetchCharAsWChar(charEncoding); // Bind columns based on their data types for (SQLUSMALLINT col = 1; col <= numCols; col++) { auto columnMeta = columnNames[col - 1].cast<py::dict>(); @@ -3747,29 +3848,41 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column // TODO: handle variable length data correctly. This logic wont // suffice HandleZeroColumnSizeAtFetch(columnSize); - // Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8 - // expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where - // each character can be up to 4 bytes. + if (fetchCharAsWChar) { + // On Windows with UTF-8: bind VARCHAR as SQL_C_WCHAR to + // bypass the ODBC driver's lossy ACP conversion. + uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; + buffers.wcharBuffers[col - 1].resize(fetchSize * fetchBufferSize); + ret = SQLBindCol_ptr(hStmt, col, SQL_C_WCHAR, + buffers.wcharBuffers[col - 1].data(), + fetchBufferSize * sizeof(SQLWCHAR), + buffers.indicators[col - 1].data()); + } else { + // Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8 + // expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where + // each character can be up to 4 bytes. 
#if defined(__APPLE__) || defined(__linux__) - uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; + uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; #else - uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; + uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; #endif - // TODO: For LONGVARCHAR/BINARY types, columnSize is returned as - // 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB. - // fetchSize=1 if columnSize>1GB. So we'll allocate a vector of - // size 2GB. If a query fetches multiple (say N) LONG... - // columns, we will have allocated multiple (N) 2GB sized - // vectors. This will make driver very slow. And if the N is - // high enough, we could hit the OS limit for heap memory that - // we can allocate, & hence get a std::bad_alloc. The process - // could also be killed by OS for consuming too much memory. - // Hence this will be revisited in beta to not allocate 2GB+ - // memory, & use streaming instead - buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize); - ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, buffers.charBuffers[col - 1].data(), - fetchBufferSize * sizeof(SQLCHAR), - buffers.indicators[col - 1].data()); + // TODO: For LONGVARCHAR/BINARY types, columnSize is returned as + // 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB. + // fetchSize=1 if columnSize>1GB. So we'll allocate a vector of + // size 2GB. If a query fetches multiple (say N) LONG... + // columns, we will have allocated multiple (N) 2GB sized + // vectors. This will make driver very slow. And if the N is + // high enough, we could hit the OS limit for heap memory that + // we can allocate, & hence get a std::bad_alloc. The process + // could also be killed by OS for consuming too much memory. 
+ // Hence this will be revisited in beta to not allocate 2GB+ + // memory, & use streaming instead + buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize); + ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, + buffers.charBuffers[col - 1].data(), + fetchBufferSize * sizeof(SQLCHAR), + buffers.indicators[col - 1].data()); + } break; } case SQL_WCHAR: @@ -3923,6 +4036,7 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum bool isLob; }; std::vector columnInfos(numCols); + const bool fetchCharAsWChar = ShouldFetchCharAsWChar(charEncoding); for (SQLUSMALLINT col = 0; col < numCols; col++) { const auto& columnMeta = columnNames[col].cast(); columnInfos[col].dataType = columnMeta["DataType"].cast(); @@ -3931,22 +4045,31 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end(); columnInfos[col].processedColumnSize = columnInfos[col].columnSize; HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize); - // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where - // each character can be up to 4 bytes. Must match SQLBindColums buffer. -#if defined(__APPLE__) || defined(__linux__) + SQLSMALLINT dt = columnInfos[col].dataType; bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR); - if (isCharType) { - columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 + - 1; // *4 for UTF-8, +1 for null terminator - } else { + + if (fetchCharAsWChar && isCharType) { + // When fetching VARCHAR as WCHAR (UTF-8 workaround on Windows), + // fetchBufferSize is in SQLWCHAR units to match SQLBindColums columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize + 1; // +1 for null terminator - } + } else { + // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where + // each character can be up to 4 bytes. Must match SQLBindColums buffer. 
+#if defined(__APPLE__) || defined(__linux__) + if (isCharType) { + columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 + + 1; // *4 for UTF-8, +1 for null terminator + } else { + columnInfos[col].fetchBufferSize = + columnInfos[col].processedColumnSize + 1; // +1 for null terminator + } #else - columnInfos[col].fetchBufferSize = - columnInfos[col].processedColumnSize + 1; // +1 for null terminator + columnInfos[col].fetchBufferSize = + columnInfos[col].processedColumnSize + 1; // +1 for null terminator #endif + } } // Performance: Build function pointer dispatch table (once per batch) @@ -3998,7 +4121,13 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum case SQL_CHAR: case SQL_VARCHAR: case SQL_LONGVARCHAR: - columnProcessors[col] = ColumnProcessors::ProcessChar; + // When fetchCharAsWChar is active, VARCHAR data is in wcharBuffers + // (bound as SQL_C_WCHAR) so use the WCHAR processor for decoding. + if (fetchCharAsWChar) { + columnProcessors[col] = ColumnProcessors::ProcessWChar; + } else { + columnProcessors[col] = ColumnProcessors::ProcessChar; + } break; case SQL_WCHAR: case SQL_WVARCHAR: @@ -4397,7 +4526,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch ColumnBuffers buffers(numCols, fetchSize); // Bind columns - ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize); + ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, charEncoding); if (!SQL_SUCCEEDED(ret)) { LOG("FetchMany_wrap: Error when binding columns - SQLRETURN=%d", ret); return ret; @@ -4745,6 +4874,7 @@ SQLRETURN FetchArrowBatch_wrap( if (!hasLobColumns && fetchSize > 0) { // Bind columns + // Arrow path has no per-connection charEncoding; the default "" disables the WCHAR workaround ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize); if (!SQL_SUCCEEDED(ret)) { LOG("Error when binding columns"); @@ -5573,7 +5703,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, 
py::list& rows, ColumnBuffers buffers(numCols, fetchSize); // Bind columns - ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize); + ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, charEncoding); if (!SQL_SUCCEEDED(ret)) { LOG("FetchAll_wrap: Error when binding columns - SQLRETURN=%d", ret); return ret; diff --git a/tests/test_013_encoding_decoding.py b/tests/test_013_encoding_decoding.py index 034afae68..937feb6d4 100644 --- a/tests/test_013_encoding_decoding.py +++ b/tests/test_013_encoding_decoding.py @@ -7256,5 +7256,122 @@ def test_dae_encoding_large_string(db_connection): cursor.close() +def test_varchar_utf8_collation_unicode_roundtrip(db_connection): + """Test that VARCHAR columns with UTF-8 collation properly round-trip Unicode data. + + This tests the scenario where a VARCHAR column uses a UTF-8 collation + (e.g., Latin1_General_100_CI_AS_SC_UTF8) which enables storing full Unicode + in VARCHAR. The ODBC driver on Windows converts SQL_C_CHAR data to the + system ANSI code page (e.g., CP1252), which is lossy for non-Latin characters. + The fix fetches such columns as SQL_C_WCHAR (UTF-16LE) to preserve all Unicode. + + Covers: fetchone, fetchall, fetchmany paths. 
+ """ + cursor = db_connection.cursor() + + try: + # Create table with UTF-8 collation on VARCHAR column + cursor.execute(""" + CREATE TABLE #test_varchar_utf8_collation ( + id INT PRIMARY KEY, + varchar_utf8 VARCHAR(200) COLLATE Latin1_General_100_CI_AS_SC_UTF8, + nvarchar_ref NVARCHAR(200) + ) + """) + + # Configure UTF-8 decoding for SQL_CHAR (VARCHAR) + db_connection.setdecoding(SQL_CHAR, encoding="utf-8") + db_connection.setdecoding(SQL_WCHAR, encoding="utf-16le") + + # Test cases covering BMP and supplementary plane characters + test_cases = [ + (1, "Hello World"), # ASCII baseline + (2, "Grüße"), # German - extended Latin (in CP1252 range) + (3, "你好世界"), # Chinese - outside CP1252 + (4, "こんにちは"), # Japanese Hiragana - outside CP1252 + (5, "Привет"), # Russian Cyrillic - outside CP1252 + (6, "Hello 世界"), # Mixed ASCII + CJK + (7, "😀😃😄😁"), # Emoji - supplementary plane (4-byte UTF-8) + (8, "Ελληνικά"), # Greek + (9, "مرحبا"), # Arabic + (10, "café résumé naïve"), # French accented + ] + + # Insert using parameterized queries + for id_val, text in test_cases: + cursor.execute( + "INSERT INTO #test_varchar_utf8_collation (id, varchar_utf8, nvarchar_ref) " + "VALUES (?, ?, ?)", + id_val, text, text, + ) + + # ---- Test fetchone path ---- + for id_val, expected_text in test_cases: + cursor.execute( + "SELECT varchar_utf8, nvarchar_ref FROM #test_varchar_utf8_collation WHERE id = ?", + id_val, + ) + row = cursor.fetchone() + assert row is not None, f"No row returned for id={id_val}" + + varchar_result = row[0] + nvarchar_result = row[1] + + # NVARCHAR should always work (baseline check) + assert nvarchar_result == expected_text, ( + f"NVARCHAR mismatch for id={id_val}: " + f"expected {expected_text!r}, got {nvarchar_result!r}" + ) + + # VARCHAR with UTF-8 collation should also return correct str + assert isinstance(varchar_result, str), ( + f"VARCHAR UTF-8 returned {type(varchar_result).__name__} instead of str " + f"for id={id_val} ({expected_text!r}): got 
{varchar_result!r}" + ) + assert varchar_result == expected_text, ( + f"VARCHAR UTF-8 mismatch for id={id_val}: " + f"expected {expected_text!r}, got {varchar_result!r}" + ) + + # ---- Test fetchall path ---- + cursor.execute( + "SELECT id, varchar_utf8, nvarchar_ref " + "FROM #test_varchar_utf8_collation ORDER BY id" + ) + all_rows = cursor.fetchall() + assert len(all_rows) == len(test_cases), ( + f"fetchall row count mismatch: expected {len(test_cases)}, got {len(all_rows)}" + ) + for row, (expected_id, expected_text) in zip(all_rows, test_cases): + assert row[1] == expected_text, ( + f"fetchall VARCHAR UTF-8 mismatch for id={expected_id}: " + f"expected {expected_text!r}, got {row[1]!r}" + ) + assert row[2] == expected_text, ( + f"fetchall NVARCHAR mismatch for id={expected_id}: " + f"expected {expected_text!r}, got {row[2]!r}" + ) + + # ---- Test fetchmany path ---- + cursor.execute( + "SELECT id, varchar_utf8, nvarchar_ref " + "FROM #test_varchar_utf8_collation ORDER BY id" + ) + many_rows = cursor.fetchmany(5) + assert len(many_rows) == 5, f"fetchmany(5) returned {len(many_rows)} rows" + for row, (expected_id, expected_text) in zip(many_rows, test_cases[:5]): + assert row[1] == expected_text, ( + f"fetchmany VARCHAR UTF-8 mismatch for id={expected_id}: " + f"expected {expected_text!r}, got {row[1]!r}" + ) + + finally: + try: + cursor.execute("DROP TABLE #test_varchar_utf8_collation") + except: + pass + cursor.close() + + if __name__ == "__main__": pytest.main([__file__, "-v"]) From 1e3e0922273ae9ed6f589228016e3d9ba9766067 Mon Sep 17 00:00:00 2001 From: Jahnvi Thakkar Date: Mon, 27 Apr 2026 16:09:38 +0530 Subject: [PATCH 2/2] Formatting test file --- tests/test_013_encoding_decoding.py | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_013_encoding_decoding.py b/tests/test_013_encoding_decoding.py index 937feb6d4..867228015 100644 --- a/tests/test_013_encoding_decoding.py +++ 
b/tests/test_013_encoding_decoding.py @@ -7285,16 +7285,16 @@ def test_varchar_utf8_collation_unicode_roundtrip(db_connection): # Test cases covering BMP and supplementary plane characters test_cases = [ - (1, "Hello World"), # ASCII baseline - (2, "Grüße"), # German - extended Latin (in CP1252 range) - (3, "你好世界"), # Chinese - outside CP1252 - (4, "こんにちは"), # Japanese Hiragana - outside CP1252 - (5, "Привет"), # Russian Cyrillic - outside CP1252 - (6, "Hello 世界"), # Mixed ASCII + CJK - (7, "😀😃😄😁"), # Emoji - supplementary plane (4-byte UTF-8) - (8, "Ελληνικά"), # Greek - (9, "مرحبا"), # Arabic - (10, "café résumé naïve"), # French accented + (1, "Hello World"), # ASCII baseline + (2, "Grüße"), # German - extended Latin (in CP1252 range) + (3, "你好世界"), # Chinese - outside CP1252 + (4, "こんにちは"), # Japanese Hiragana - outside CP1252 + (5, "Привет"), # Russian Cyrillic - outside CP1252 + (6, "Hello 世界"), # Mixed ASCII + CJK + (7, "😀😃😄😁"), # Emoji - supplementary plane (4-byte UTF-8) + (8, "Ελληνικά"), # Greek + (9, "مرحبا"), # Arabic + (10, "café résumé naïve"), # French accented ] # Insert using parameterized queries @@ -7302,7 +7302,9 @@ def test_varchar_utf8_collation_unicode_roundtrip(db_connection): cursor.execute( "INSERT INTO #test_varchar_utf8_collation (id, varchar_utf8, nvarchar_ref) " "VALUES (?, ?, ?)", - id_val, text, text, + id_val, + text, + text, ) # ---- Test fetchone path ---- @@ -7335,13 +7337,12 @@ def test_varchar_utf8_collation_unicode_roundtrip(db_connection): # ---- Test fetchall path ---- cursor.execute( - "SELECT id, varchar_utf8, nvarchar_ref " - "FROM #test_varchar_utf8_collation ORDER BY id" + "SELECT id, varchar_utf8, nvarchar_ref " "FROM #test_varchar_utf8_collation ORDER BY id" ) all_rows = cursor.fetchall() - assert len(all_rows) == len(test_cases), ( - f"fetchall row count mismatch: expected {len(test_cases)}, got {len(all_rows)}" - ) + assert len(all_rows) == len( + test_cases + ), f"fetchall row count mismatch: expected 
{len(test_cases)}, got {len(all_rows)}" for row, (expected_id, expected_text) in zip(all_rows, test_cases): assert row[1] == expected_text, ( f"fetchall VARCHAR UTF-8 mismatch for id={expected_id}: " @@ -7354,8 +7355,7 @@ def test_varchar_utf8_collation_unicode_roundtrip(db_connection): # ---- Test fetchmany path ---- cursor.execute( - "SELECT id, varchar_utf8, nvarchar_ref " - "FROM #test_varchar_utf8_collation ORDER BY id" + "SELECT id, varchar_utf8, nvarchar_ref " "FROM #test_varchar_utf8_collation ORDER BY id" ) many_rows = cursor.fetchmany(5) assert len(many_rows) == 5, f"fetchmany(5) returned {len(many_rows)} rows"