Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

encoding.iconv: Fix iconv type cstrict, add LOCAL encoding #22398

Merged
merged 7 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 36 additions & 14 deletions vlib/encoding/iconv/iconv.v
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,35 @@ fn reverse_u32(src u32) u32 {
// vstring_to_encoding converts the V string `str` to a string in the `tocode` encoding
// tip: use `iconv --list` to check for supported encodings
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
encoding_name := tocode.to_upper()
mut encoding_name := tocode.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
return conv(tocode, 'UTF-8', str.str, str.len)
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
return conv(encoding_name, 'UTF-8', str.str, str.len)
}

// encoding_to_vstring converts the given `bytes` from the `fromcode` encoding to a V string (encoded with UTF-8)
// tip: use `iconv --list` to check for supported encodings
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
encoding_name := fromcode.to_upper()
mut encoding_name := fromcode.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
mut dst := conv('UTF-8', fromcode, bytes.data, bytes.len)!
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
mut dst := conv('UTF-8', encoding_name, bytes.data, bytes.len)!
dst << 0 // add a tail zero, to build a vstring
return unsafe { cstring_to_vstring(dst.data) }
}
Expand All @@ -43,7 +57,15 @@ pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
// for utf32be, it will prepend 0x0000FEFF to the `src`
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
mut clone := src.clone()
match utf_type.to_upper() {
mut encoding_name := utf_type.to_upper()
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
match encoding_name {
'UTF8', 'UTF-8' {
clone.prepend([u8(0xEF), 0xBB, 0xBF])
}
Expand Down Expand Up @@ -73,7 +95,15 @@ pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
@[direct_array_access]
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
mut clone := src.clone()
match utf_type.to_upper() {
mut encoding_name := utf_type.to_upper()
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
match encoding_name {
'UTF8', 'UTF-8' {
if clone.len > 3 {
if clone[0] == u8(0xEF) && clone[1] == u8(0xBB) && clone[2] == u8(0xBF) {
Expand Down Expand Up @@ -119,10 +149,6 @@ pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
// write_file_encoding converts `text` into `encoding` and writes the result to the file at the given `path`. If `path` already exists, it will be overwritten.
// For `encoding` in UTF8/UTF16/UTF32, if `bom` is true, a BOM header is also written to the file.
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
encoding_name := encoding.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
encoding_bytes := vstring_to_encoding(text, encoding)!
if bom && encoding.to_upper().starts_with('UTF') {
encoding_bom_bytes := create_utf_string_with_bom(encoding_bytes, encoding)
Expand All @@ -134,10 +160,6 @@ pub fn write_file_encoding(path string, text string, encoding string, bom bool)

// read_file_encoding reads the file in `path` with `encoding` and returns the contents
pub fn read_file_encoding(path string, encoding string) !string {
encoding_name := encoding.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
encoding_bytes := os.read_file_array[u8](path)
encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
return encoding_to_vstring(encoding_without_bom_bytes, encoding)!
Expand Down
10 changes: 5 additions & 5 deletions vlib/encoding/iconv/iconv_nix.c.v
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ module iconv
#include <iconv.h>
#flag darwin -liconv

fn C.iconv_open(tocode &u8, fromcode &u8) voidptr
fn C.iconv_open(tocode charptr, fromcode charptr) voidptr
fn C.iconv_close(cd voidptr) int
fn C.iconv(cd voidptr, inbuf &&u8, inbytesleft &usize, outbuf &&u8, outbytesleft &usize) usize
fn C.iconv(cd voidptr, inbuf &charptr, inbytesleft &usize, outbuf &charptr, outbytesleft &usize) usize

// conv converts a string from the `fromcode` encoding to the `tocode` encoding
@[direct_array_access]
Expand Down Expand Up @@ -35,16 +35,16 @@ fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
else {}
}

mut cd := C.iconv_open(dst_encoding.str, src_encoding.str)
mut cd := C.iconv_open(charptr(dst_encoding.str), charptr(src_encoding.str))
if isize(cd) == -1 {
return error('platform can\'t convert from ${src_encoding} to ${dst_encoding}')
}
defer { C.iconv_close(cd) }

mut dst := []u8{len: (src_len + 1) * 4} // this should be enough to hold the dst encoding string

mut src_ptr := &u8(src)
mut dst_ptr := &u8(dst.data)
mut src_ptr := charptr(src)
mut dst_ptr := charptr(dst.data)
mut src_left := usize(src_len)
mut dst_left := usize(dst.len)
res := C.iconv(cd, &src_ptr, &src_left, &dst_ptr, &dst_left)
Expand Down
10 changes: 10 additions & 0 deletions vlib/encoding/iconv/iconv_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ fn test_vstring_to_encoding() {
abc_utf32be := iconv.vstring_to_encoding('abc', 'UTF-32BE')!
assert abc_utf32be == [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]

abc_local := iconv.vstring_to_encoding('abc', 'LOCAL')!
// Windows LOCAL: ANSI encoding
// Linux LOCAL: UTF-8 encoding
assert abc_local == [u8(97), 98, 99]

if abc_not_exist := iconv.vstring_to_encoding('abc', 'encoding_not_exist') {
assert false, 'encoding_not_exist'
}
Expand Down Expand Up @@ -53,6 +58,11 @@ fn test_encoding_to_vstring() {
'UTF-32BE')!
assert abc_utf32be == 'abc'

abc_local := iconv.encoding_to_vstring([u8(97), 98, 99], 'LOCAL')!
// Windows LOCAL: ANSI encoding
// Linux LOCAL: UTF-8 encoding
assert abc_local == 'abc'

if abc_not_exist := iconv.encoding_to_vstring([u8(97), 98, 99], 'encoding_not_exist') {
assert false, 'encoding_not_exist'
}
Expand Down
Loading