From ad9bae788ede47a1f781d23073c3f0946d4f2e2f Mon Sep 17 00:00:00 2001 From: starainrt Date: Thu, 15 Jul 2021 11:09:32 +0800 Subject: [PATCH] first commit --- gbk.go | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++ gbk_test.go | 31 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 gbk.go create mode 100644 gbk_test.go diff --git a/gbk.go b/gbk.go new file mode 100644 index 0000000..8e68f79 --- /dev/null +++ b/gbk.go @@ -0,0 +1,90 @@ +package startext + +import "golang.org/x/text/encoding/simplifiedchinese" + +func IsUtf8(data []byte) bool { + return isUtf8(data) +} + +func IsGBK(data []byte) bool { + return (!isUtf8(data)) && isGBK(data) +} + +func isGBK(data []byte) bool { + length := len(data) + var i int = 0 + for i < length { + if data[i] <= 0x7f { + //编码0~127,只有一个字节的编码,兼容ASCII码 + i++ + continue + } else { + //大于127的使用双字节编码,落在gbk编码范围内的字符 + if data[i] >= 0x81 && + data[i] <= 0xfe && + data[i+1] >= 0x40 && + data[i+1] <= 0xfe && + data[i+1] != 0x7f { + i += 2 + continue + } else { + return false + } + } + } + return true +} + +//UTF-8编码格式的判断 + +func preNUm(data byte) int { + var mask byte = 0x80 + var num int = 0 + //8bit中首个0bit前有多少个1bits + for i := 0; i < 8; i++ { + if (data & mask) == mask { + num++ + mask = mask >> 1 + } else { + break + } + } + return num +} +func isUtf8(data []byte) bool { + i := 0 + for i < len(data) { + if (data[i] & 0x80) == 0x00 { + // 0XXX_XXXX + i++ + continue + } else if num := preNUm(data[i]); num > 2 { + // 110X_XXXX 10XX_XXXX + // 1110_XXXX 10XX_XXXX 10XX_XXXX + // 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX + // 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX + // 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX + // preNUm() 返回首个字节的8个bits中首个0bit前面1bit的个数,该数量也是该字符所使用的字节数 + i++ + for j := 0; j < num-1; j++ { + //判断后面的 num - 1 个字节是不是都是10开头 + if (data[i] & 0xc0) != 0x80 { + return false + } + i++ + } + } else { + //其他情况说明不是utf-8 + return false + } + } + return true +} + +func GBK2UTF8(data []byte) ([]byte,error) { + return simplifiedchinese.GBK.NewDecoder().Bytes(data) +} + +func UTF82GBK(data []byte) ([]byte,error) { + return simplifiedchinese.GBK.NewEncoder().Bytes(data) +} \ No newline at end of file diff --git a/gbk_test.go b/gbk_test.go new file mode 100644 index 0000000..22fd6e8 --- /dev/null +++ b/gbk_test.go @@ -0,0 +1,31 @@ +package startext + +import ( + "fmt" + "testing" +) + +func Test_Gbk(t *testing.T) { + str:="你好" + if IsGBK([]byte(str)) { + t.Fail() + } + if !IsUtf8([]byte(str)) { + t.Fail() + } + gbk,err:=UTF82GBK([]byte(str)) + if err!=nil{ + t.Fatal(err) + } + if !IsGBK(gbk) { + t.Fail() + } + if IsUtf8(gbk) { + t.Fail() + } + utf8,err:=GBK2UTF8(gbk) + if err!=nil{ + t.Fatal(err) + } + fmt.Println(string(utf8)) +}