utf8-f_detail.f90 Source File


Contents

Source Code


Source Code

module utf8_detail
    use utf8_const
    implicit none
    public

    type :: utf8_string
        private
        character(len=:, kind=c_char), allocatable :: str
    contains
        procedure :: iterator
    end type

    type :: utf8_string_iterator
        private
        character(len=:, kind=c_char), pointer :: ptr => null()
        integer :: cur = 1
    contains
        procedure :: has_next => iterator_has_next
        procedure :: get_next => iterator_get_next
    end type

    interface construct_utf8_string
        module procedure :: utf8_construct_from_char
    end interface

    interface assignment(=)
        module procedure :: utf8_assign_from_char
        module procedure :: char_assign_from_utf8
    end interface

    interface utf8_is_valid
        module procedure :: utf8_is_valid_char
        module procedure :: utf8_is_valid_string
    end interface

contains

    subroutine utf8_construct_from_char(utf8, str, escape)
        type(utf8_string), intent(out) :: utf8
        character(len=*, kind=c_char), intent(in) :: str
        logical, optional, intent(in) :: escape
            !! if parse unicode escape sequence \uXXXX, default is .false.
        logical :: parse

        if (present(escape)) then
            parse = escape
        else
            parse = .false.
        end if

        if (parse) then
            ! to be implemented
            allocate (utf8%str, source=str)
        else
            allocate (utf8%str, source=str)
        end if

    end subroutine utf8_construct_from_char

    subroutine utf8_assign_from_char(utf8, str)
        type(utf8_string), intent(out) :: utf8
        character(len=*, kind=c_char), intent(in) :: str

        call utf8_construct_from_char(utf8, str, .false.)

    end subroutine utf8_assign_from_char

    subroutine char_assign_from_utf8(str, utf8)
        character(len=:, kind=c_char), allocatable, intent(out) :: str
        type(utf8_string), intent(in) :: utf8

        if (allocated(utf8%str)) then
            allocate (str, source=utf8%str)
        else
            allocate (character(len=0, kind=c_char) :: str)
        end if

    end subroutine char_assign_from_utf8

    !> return the number of UTF-8 code points
    pure function utf8_len(utf8) result(l)
        class(utf8_string), intent(in) :: utf8
        integer :: l
        integer :: i

        l = 0; i = 1
        do
            if (i > len(utf8%str)) exit
            i = i + codepoint_num_bytes(cast_byte(utf8%str(i:i)))
            l = l + 1
        end do

    end function utf8_len

    !> return the code point at specified position
    pure function utf8_at(utf8, idx) result(s)
        class(utf8_string), intent(in) :: utf8
        integer, intent(in) :: idx
        character(len=:, kind=c_char), allocatable :: s
        integer :: i, j, n

        if (idx < 1) then
            allocate (character(len=0, kind=c_char) :: s); return
        end if

        i = 1; j = 1
        do
            if (i > len(utf8%str)) then
                allocate (character(len=0, kind=c_char) :: s); return
            end if
            n = codepoint_num_bytes(cast_byte(utf8%str(i:i)))
            if (j == idx) then
                allocate (s, source=utf8%str(i:i + n - 1)); return
            end if
            i = i + n
            j = j + 1
        end do

    end function utf8_at

    !> return a substring of utf8_string
    pure function utf8_slice(utf8, begin, end) result(slice)
        class(utf8_string), intent(in) :: utf8
        integer, intent(in) :: begin
        integer, intent(in) :: end
        character(len=:, kind=c_char), allocatable :: slice
        integer :: bi, bj ! byte index for begin (i) and end (j)
        integer :: ci, cj ! codepoint index for begin (i) and end (j)
        integer :: n

        if (begin > end .or. begin > len(utf8%str) .or. end < 1) then
            allocate (character(len=0, kind=c_char) :: slice); return
        end if

        bi = 1; ci = 1
        if (begin > 1) then
            do
                if (ci == begin) exit
                if (bi > len(utf8%str)) then
                    allocate (character(len=0, kind=c_char) :: slice); return
                end if
                n = codepoint_num_bytes(cast_byte(utf8%str(bi:bi)))
                bi = bi + n
                ci = ci + 1
            end do
        end if

        bj = bi; cj = ci
        do
            if (bj > len(utf8%str)) then
                allocate (slice, source=utf8%str(bi:)); return
            end if
            n = codepoint_num_bytes(cast_byte(utf8%str(bi:bi)))
            if (cj == end) then
                allocate (slice, source=utf8%str(bi:bj + n - 1)); return
            end if
            bj = bj + n
            cj = cj + 1
        end do

    end function utf8_slice

    !> return the position where substring occurs in utf8_string for the first time
    pure function utf8_index(utf8, substring) result(idx)
        class(utf8_string), intent(in) :: utf8
        character(len=*, kind=c_char), intent(in) :: substring
        integer :: idx
        integer :: bit, cit
        integer :: nt, ls

        idx = 0
        bit = 1; cit = 1
        ls = len(substring)
        do
            if (bit + ls - 1 > len(utf8%str)) exit
            if (utf8%str(bit:bit + ls - 1) == substring(:)) then
                idx = cit; return
            end if
            nt = codepoint_num_bytes(cast_byte(utf8%str(bit:bit)))
            bit = bit + nt
            cit = cit + 1
        end do

    end function utf8_index

    !> count the substring in utf8_string
    !> overlaps are not considered
    !> e.g. utf8_count("AUAUAUAUAUAUAU","AUA") returns 3
    pure function utf8_count(utf8, substring) result(count)
        class(utf8_string), intent(in) :: utf8
        character(len=*, kind=c_char), intent(in) :: substring
        integer :: count

        if (.not. utf8_is_valid(substring) .or. len(substring) == 0) then
            count = 0; return
        end if

        count = count_internal(utf8%str, substring)

    contains

        pure recursive function count_internal(full, sub) result(c)
            character(len=*, kind=c_char), intent(in) :: full
            character(len=*, kind=c_char), intent(in) :: sub
            integer :: c
            integer :: l, idx

            l = len(sub)
            idx = index(full, sub)
            if (idx == 0) then
                c = 0; return
            else
                c = count_internal(full(idx + l:), sub) + 1
            end if

        end function count_internal

    end function utf8_count

    !> split utf8_string based on the separation string
    subroutine utf8_split(utf8, sep, list)
        class(utf8_string), intent(in), target :: utf8
        character(len=*, kind=c_char), intent(in) :: sep
        type(utf8_string), dimension(:), allocatable, intent(out) :: list
        character(len=:, kind=c_char), pointer :: ptr
        integer :: nsep, l
        integer :: i, e

        nsep = utf8_count(utf8, sep)
        allocate(list(nsep+1))

        if (nsep == 0) then
            allocate(list(1)%str, source=utf8%str)
        else
            l = len(sep)
            e = 1
            ptr => utf8%str(:)
            do i = 1, nsep
                e = index(ptr, sep)
                allocate(list(i)%str, source=ptr(:e-1))
                ptr => ptr(e+l:)
            end do
            allocate(list(nsep+1)%str, source=ptr(:))
        end if

    end subroutine utf8_split

    !> reverse the order of code points in place
    subroutine utf8_reverse(utf8)
        class(utf8_string), intent(inout) :: utf8
        character(len=:, kind=c_char), allocatable :: tmp
        integer :: i, j, l, n

        l = len(utf8%str)
        call move_alloc(from=utf8%str, to=tmp)
        allocate (character(len=l, kind=c_char) :: utf8%str)

        i = 1; j = l
        do
            if (i > l) exit
            n = codepoint_num_bytes(cast_byte(tmp(i:i)))
            utf8%str(j - n + 1:j) = tmp(i:i + n - 1)
            i = i + n
            j = j - n
        end do

    end subroutine utf8_reverse

    !> return an iterator of utf8_string
    function iterator(this) result(itr)
        class(utf8_string), target, intent(in) :: this
        type(utf8_string_iterator) :: itr

        itr%ptr => this%str

    end function iterator

!> iterator methods

    pure function iterator_has_next(this) result(r)
        class(utf8_string_iterator), intent(in) :: this
        logical :: r

        r = this%cur <= len(this%ptr)

    end function iterator_has_next

    function iterator_get_next(this) result(cp)
        class(utf8_string_iterator), intent(inout) :: this
        !character(len=:, kind=c_char), pointer :: cp
        character(len=:, kind=c_char), allocatable :: cp
        integer :: n

        n = codepoint_num_bytes(cast_byte(this%ptr(this%cur:this%cur)))
        cp = this%ptr(this%cur:this%cur + n - 1)
        this%cur = this%cur + n

    end function iterator_get_next

    !> check if the whole string is valid utf8 encoding
    pure function utf8_is_valid_char(str) result(r)
        character(len=*, kind=c_char), intent(in) :: str
        logical :: r
        integer(kind=c_int8_t) :: byte
        integer :: i, n

        i = 1
        do
            if (i > len(str)) exit
            byte = cast_byte(str(i:i))
            if (iand(byte, int(z'80', c_int8_t)) == int(z'00', c_int8_t)) then
                ! first byte: 0xxxxxxx and 00..7F
                i = i + 1
            else if (iand(byte, int(z'E0', c_int8_t)) == int(z'C0', c_int8_t) .and. &
                     iand(byte, int(z'1F', c_int8_t)) > int(z'01', c_int8_t)) then
                ! first byte: 110yyyyy and C2..DF
                if (i + 1 > len(str)) then
                    r = .false.; return
                end if
                byte = cast_byte(str(i + 1:i + 1))
                if (iand(byte, int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! secpnd byte: 10xxxxxx and 80..BF
                    r = .false.; return
                end if
                i = i + 2
            else if (iand(byte, int(z'F0', c_int8_t)) == int(z'E0', c_int8_t)) then
                ! first byte: 1110zzzz and E0..EF
                if (i + 2 > len(str)) then
                    r = .false.; return
                end if
                if (iand(cast_byte(str(i + 1:i + 1)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 2:i + 2)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! second and third bytes: 10xxxxxx
                    r = .false.; return
                end if
                if (byte == int(z'E0', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) < int(z'20', c_int8_t)) then
                        ! E0  A0..BF  80..BF
                        r = .false.; return
                    end if
                end if
                if (byte == int(z'ED', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) > int(z'1F', c_int8_t)) then
                        ! ED  80..9F  80..BF
                        r = .false.; return
                    end if
                end if
                i = i + 3
            else if (iand(byte, int(z'F8', c_int8_t)) == int(z'F0', c_int8_t) .and. &
                     iand(byte, int(z'07', c_int8_t)) < int(z'05', c_int8_t)) then
                ! first byte: 11110uuu and F0..F4
                if (i + 3 > len(str)) then
                    r = .false.; return
                end if
                if (iand(cast_byte(str(i + 1:i + 1)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 2:i + 2)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 3:i + 3)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! second, third, and last bytes: 10xxxxxx
                    r = .false.; return
                end if
                if (byte == int(z'F0', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) < int(z'10', c_int8_t)) then
                        ! F0  90..BF  80..BF  80..BF
                        r = .false.; return
                    end if
                end if
                if (byte == int(z'F4', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) > int(z'0F', c_int8_t)) then
                        ! F4  80..8F  80..BF  80..BF
                        r = .false.; return
                    end if
                end if
                i = i + 4
            else
                r = .false.; return
            end if
        end do

        r = .true.

    end function utf8_is_valid_char

    pure function utf8_is_valid_string(str) result(r)
        type(utf8_string), intent(in) :: str
        logical :: r

        r = utf8_is_valid_char(str%str)

    end function utf8_is_valid_string

!> private helper functions

    !> cast char to byte (8-bits integer in Fortran)
    !> display: private
    pure function cast_byte(char) result(byte)
        character(kind=c_char, len=1), intent(in) :: char
        integer(kind=c_int8_t) :: byte

        byte = transfer(char, byte)

    end function cast_byte

    !> get the number of bytes of a code point based on its first byte
    !> display: private
    pure function codepoint_num_bytes(byte) result(n)
        integer(kind=c_int8_t), intent(in) :: byte
        integer :: n

        n = NUM_BYTES_UTF8(iand(int(byte, 4), int(z'000000FF', 4)))
        if (n == 0) n = 1

    end function codepoint_num_bytes

end module utf8_detail