utf8_is_valid_char Function

public pure function utf8_is_valid_char(str) result(r)

check if the whole string is valid utf8 encoding

Arguments

TypeIntentOptionalAttributesName
character(kind=c_char,len=*), intent(in) :: str

Return Value logical


Contents

Source Code


Source Code

    pure function utf8_is_valid_char(str) result(r)
        character(len=*, kind=c_char), intent(in) :: str
        logical :: r
        integer(kind=c_int8_t) :: byte
        integer :: i, n

        i = 1
        do
            if (i > len(str)) exit
            byte = cast_byte(str(i:i))
            if (iand(byte, int(z'80', c_int8_t)) == int(z'00', c_int8_t)) then
                ! first byte: 0xxxxxxx and 00..7F
                i = i + 1
            else if (iand(byte, int(z'E0', c_int8_t)) == int(z'C0', c_int8_t) .and. &
                     iand(byte, int(z'1F', c_int8_t)) > int(z'01', c_int8_t)) then
                ! first byte: 110yyyyy and C2..DF
                if (i + 1 > len(str)) then
                    r = .false.; return
                end if
                byte = cast_byte(str(i + 1:i + 1))
                if (iand(byte, int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! secpnd byte: 10xxxxxx and 80..BF
                    r = .false.; return
                end if
                i = i + 2
            else if (iand(byte, int(z'F0', c_int8_t)) == int(z'E0', c_int8_t)) then
                ! first byte: 1110zzzz and E0..EF
                if (i + 2 > len(str)) then
                    r = .false.; return
                end if
                if (iand(cast_byte(str(i + 1:i + 1)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 2:i + 2)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! second and third bytes: 10xxxxxx
                    r = .false.; return
                end if
                if (byte == int(z'E0', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) < int(z'20', c_int8_t)) then
                        ! E0  A0..BF  80..BF
                        r = .false.; return
                    end if
                end if
                if (byte == int(z'ED', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) > int(z'1F', c_int8_t)) then
                        ! ED  80..9F  80..BF
                        r = .false.; return
                    end if
                end if
                i = i + 3
            else if (iand(byte, int(z'F8', c_int8_t)) == int(z'F0', c_int8_t) .and. &
                     iand(byte, int(z'07', c_int8_t)) < int(z'05', c_int8_t)) then
                ! first byte: 11110uuu and F0..F4
                if (i + 3 > len(str)) then
                    r = .false.; return
                end if
                if (iand(cast_byte(str(i + 1:i + 1)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 2:i + 2)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t) .or. &
                    iand(cast_byte(str(i + 3:i + 3)), int(z'C0', c_int8_t)) /= int(z'80', c_int8_t)) then
                    ! second, third, and last bytes: 10xxxxxx
                    r = .false.; return
                end if
                if (byte == int(z'F0', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) < int(z'10', c_int8_t)) then
                        ! F0  90..BF  80..BF  80..BF
                        r = .false.; return
                    end if
                end if
                if (byte == int(z'F4', c_int8_t)) then
                    if (iand(cast_byte(str(i + 1:i + 1)), int(z'3F', c_int8_t)) > int(z'0F', c_int8_t)) then
                        ! F4  80..8F  80..BF  80..BF
                        r = .false.; return
                    end if
                end if
                i = i + 4
            else
                r = .false.; return
            end if
        end do

        r = .true.

    end function utf8_is_valid_char