Up

module CamomileLibrary

: sig

Camomile's toplevel interface

#
module ConfigInt : sig

Type of configuration parameters

#
module type Type = sig

Configuration values

#
val datadir : string

Directory of compiled Unicode data

#
val charmapdir : string

Directory of compiled character mapping tables a la ISO

#
val unimapdir : string

Directory of camomile-style compiled character mapping table

#
val localedir : string

Directory of compiled locale data

end
end
#
module DefaultConfig : ConfigInt.Type

Default configuration.

Individual modules

#
module OOChannel : sig

Object Oriented Channel

#
class type ['a] obj_input_channel =
#
method close_in : unit -> unit
#
method get : unit -> 'a

Generic input channel. Has the same interface as the polymorphic input channel described in the recommendation at http://www.ocaml-programming.de/rec/IO-Classes.html — all channels of Camomile having this interface must conform to the behaviour defined in that recommendation.

#
class type ['a] obj_output_channel =
#
method close_out : unit -> unit

If close_out cannot output all buffered objects, Failure is raised.

If flush cannot output all buffered objects, flush raises Failure

#
method flush : unit -> unit
#
method put : 'a -> unit

Generic output channel. Has the same interface as the polymorphic output channel described in the recommendation at http://www.ocaml-programming.de/rec/IO-Classes.html — all channels of Camomile having this interface must conform to the behaviour defined in that recommendation.

#
class ['a] channel_of_stream : 'a Stream.t -> ['a] obj_input_channel

Convert stream to obj_input_channel

#
val stream_of_channel : 'a #obj_input_channel -> 'a Stream.t

Convert obj_input_channel to stream

#
class type char_input_channel =
#
method input : string -> int -> int -> int
#
method close_in : unit -> unit

Character (byte) input channel. Has the same interface as the octet input channel described in the recommendation at http://www.ocaml-programming.de/rec/IO-Classes.html — all channels of Camomile having this interface must conform to the behaviour defined in that recommendation. In addition, all channels are assumed to be blocking. If you supply a non-blocking channel to the Camomile API, the outcome is undefined.

#
class type char_output_channel =
#
method output : string -> int -> int -> int
#
method flush : unit -> unit
#
method close_out : unit -> unit

Character (byte) output channel. Has the same interface as the octet output channel described in the recommendation at http://www.ocaml-programming.de/rec/IO-Classes.html — all channels of Camomile having this interface must conform to the behaviour defined in that recommendation. In addition, all channels are assumed to be blocking. If you supply a non-blocking channel to the Camomile API, the outcome is undefined.

#
class char_input_channel_of : char #obj_input_channel -> char_input_channel

Convert a polymorphic input channel to a character input channel

#
class char_obj_input_channel_of : char_input_channel -> [char] obj_input_channel

Convert a character input channel to a polymorphic input channel

#
class char_output_channel_of : char #obj_output_channel -> char_output_channel

Convert a polymorphic output channel to a character output channel

#
class char_obj_output_channel_of : char_output_channel -> [char] obj_output_channel

Convert a character output channel to a polymorphic output channel

#
class of_in_channel : Pervasives.in_channel -> char_input_channel

Convert an OCaml input channel to an OO-based character input channel

#
class of_out_channel : Pervasives.out_channel -> char_output_channel

Convert an OCaml output channel to an OO-based character output channel

end
#
module UChar : sig

Unicode (ISO-UCS) characters.

This module implements Unicode (actually ISO-UCS) characters. All 31-bit code points are allowed.

#
type t

Unicode characters. All 31bit code points are allowed.

#
exception Out_of_range
#
val char_of : t -> char

char_of u returns the Latin-1 representation of u. If u can not be represented by Latin-1, raises Out_of_range

#
val of_char : char -> t

of_char c returns the Unicode character of the Latin-1 character c

#
val code : t -> int

code u returns the Unicode code number of u. If the value can not be represented by a positive integer, raise Out_of_range

#
val chr : int -> t

chr n returns the Unicode character with the code number n. If n >= 2^31 or n < 0, raises Invalid_argument

#
external uint_code : t -> int = "%identity"

uint_code u returns the Unicode code number of u. The returned int is unsigned, that is, on 32-bits platforms, the sign bit is used for storing the 31-th bit of the code number.

#
val chr_of_uint : int -> t

chr_of_uint n returns the Unicode character of the code number n. n is interpreted as unsigned, that is, on 32-bit platforms, the sign bit is treated as the 31st bit of the code number. If n exceeds the 31-bit range, raises Invalid_argument.

#
val eq : t -> t -> bool

Equality by code point comparison

#
val compare : t -> t -> int

compare u1 u2 returns, a value > 0 if u1 has a larger Unicode code number than u2, 0 if u1 and u2 are the same Unicode character, a value < 0 if u1 has a smaller Unicode code number than u2.

#
type uchar = t

Aliases of type t

#
val int_of : uchar -> int

Alias of uint_code

#
val of_int : int -> uchar

Alias of chr_of_uint

end
#
module USet : sig

Sets of Unicode characters, implemented as sets of intervals. The signature is mostly the same as Set.S in stdlib

#
type t
#
val empty : t
#
val is_empty : t -> bool
#
val mem : UChar.t -> t -> bool
#
val add : UChar.t -> t -> t
#
val add_range : UChar.t -> UChar.t -> t -> t

add_range u1 u2 s adds the characters in the range u1 - u2 to s. The range is determined by the code point order.

#
val singleton : UChar.t -> t
#
val remove : UChar.t -> t -> t
#
val remove_range : UChar.t -> UChar.t -> t -> t

remove_range u1 u2 s removes the characters in the range u1 - u2 from s. The range is determined by the code point order.

#
val union : t -> t -> t
#
val inter : t -> t -> t
#
val diff : t -> t -> t
#
val compl : t -> t

compl s returns the complement of s.

#
val compare : t -> t -> int
#
val equal : t -> t -> bool
#
val subset : t -> t -> bool
#
val from : UChar.t -> t -> t

from u s returns the set of elements of s whose code points are equal or greater than u.

#
val after : UChar.t -> t -> t

after u s returns the set of elements of s whose code points are greater than u.

#
val until : UChar.t -> t -> t

until u s returns the set of elements of s whose code points are equal or smaller than u.

#
val before : UChar.t -> t -> t

before u s returns the set of elements of s whose code points are smaller than u.

#
val iter : (UChar.t -> unit) -> t -> unit
#
val iter_range : (UChar.t -> UChar.t -> unit) -> t -> unit

iter_range proc s feeds the intervals contained in s to proc in increasing order. The intervals given to proc are always separated by the character not in s.

#
val fold : (UChar.t -> 'a -> 'a) -> t -> 'a -> 'a
#
val fold_range : (UChar.t -> UChar.t -> 'a -> 'a) -> t -> 'a -> 'a

fold_range f s x is equivalent to f u_i u_(i+1) (... (f u_3 u_4 (f u_1 u_2 x))) if s consists of the intervals u_1-u_2, u_3-u_4, ..., u_i-u_(i+1) in increasing order. The intervals given to f are always separated by a character not in s.

#
val for_all : (UChar.t -> bool) -> t -> bool
#
val exists : (UChar.t -> bool) -> t -> bool
#
val filter : (UChar.t -> bool) -> t -> t
#
val partition : (UChar.t -> bool) -> t -> t * t
#
val cardinal : t -> int
#
val elements : t -> UChar.t list
#
val ranges : t -> (UChar.t * UChar.t) list

The list of the intervals contained in the set. The returned intervals are always separated by the character not in s.

#
val min_elt : t -> UChar.t
#
val max_elt : t -> UChar.t
#
val choose : t -> UChar.t

Returns an element roughly in the middle of the set. It is not guaranteed to return the same element for sets with the same elements.

#
val uset_of_iset : ISet.t -> t
#
val iset_of_uset : t -> ISet.t
end
#
module UMap : sig
#
type 'a t

Maps over Unicode characters.

#
val empty : 'a t
#
val is_empty : 'a t -> bool
#
val add : ?eq:('a -> 'a -> bool) -> UChar.t -> 'a -> 'a t -> 'a t

add ?eq u v m returns the new map which is the same as m except that it maps u to some value v' which satisfies eq v v'. If eq is not supplied, structural equality is used.

#
val add_range : ?eq:('a -> 'a -> bool) -> UChar.t -> UChar.t -> 'a -> 'a t -> 'a t

add_range ?eq u1 u2 v m returns the new map which is the same as m except that it maps the characters in the range u1-u2 to some value v' which satisfies eq v v'. If eq is not supplied, structural equality is used.

#
val find : UChar.t -> 'a t -> 'a
#
val remove : UChar.t -> 'a t -> 'a t
#
val remove_range : UChar.t -> UChar.t -> 'a t -> 'a t

remove_range u1 u2 m removes u1-u2 from the domain of m

#
val from : UChar.t -> 'a t -> 'a t

from u m restricts the domain of m to the characters whose code points are equal or greater than u.

#
val after : UChar.t -> 'a t -> 'a t

after u m restricts the domain of m to the characters whose code points are greater than u.

#
val until : UChar.t -> 'a t -> 'a t

until u m restricts the domain of m to the characters whose code points are equal or smaller than u.

#
val before : UChar.t -> 'a t -> 'a t

before u m restricts the domain of m to the characters whose code points are smaller than u.

#
val mem : UChar.t -> 'a t -> bool
#
val iter : (UChar.t -> 'a -> unit) -> 'a t -> unit
#
val iter_range : (UChar.t -> UChar.t -> 'a -> unit) -> 'a t -> unit

iter_range proc m : For each contiguous region u1-u2 that is mapped to a constant v, proc u1 u2 v is called. The order of calls is determined by increasing order on u1.

map ?eq f m and mapi ?eq f m : Similar to map and mapi in stdlib Map, but if the map m' is returned, it is only guaranteed that eq (find u m') (f (find u m )) is true for map and eq (find u m') (f u (find u m )) is true for mapi. If eq is not specified, structural equality is used.

#
val map : ?eq:('b -> 'b -> bool) -> ('a -> 'b) -> 'a t -> 'b t
#
val mapi : ?eq:('b -> 'b -> bool) -> (UChar.t -> 'a -> 'b) -> 'a t -> 'b t
#
val fold : (UChar.t -> 'b -> 'a -> 'a) -> 'b t -> 'a -> 'a
#
val fold_range : (UChar.t -> UChar.t -> 'b -> 'a -> 'a) -> 'b t -> 'a -> 'a

fold_range f m x is equivalent to f u_(2n-1) u_(2n) v_n (... (f u_1 u_2 v_1 x)) where all characters in the range u_(2k-1)-u_(2k) are mapped to v_k and u_1 < u_3 < ... in code point order. Each range u_(2k-1)-u_(2k) is separated from the next by a character which is not mapped to v_k.

#
val set_to_map : USet.t -> 'a -> 'a t

Constant map.

#
val domain : 'a t -> USet.t

Domain.

#
val map_to_set : ('a -> bool) -> 'a t -> USet.t

map_to_set p m returns the set of characters which are mapped to values satisfying the predicate p by m.

#
val umap_of_imap : 'a IMap.t -> 'a t
#
val imap_of_umap : 'a t -> 'a IMap.t
end
#
module UCharTbl : sig

Fast lookup tables for Unicode. Accessible in constant time.

#
type 'a tbl

Fast lookup tables. Accessible in constant time.

#
type 'a t = 'a tbl
#
val get : 'a tbl -> UChar.t -> 'a
#
module type Type = sig
#
type elt
#
type t = elt tbl
#
val get : elt tbl -> UChar.t -> elt
#
val of_map : elt -> elt UMap.t -> t

of_map def m creates the table which has the same values as m. The table returns def for the characters for which m is undefined.

end
#
module Make : functor (H : Hashtbl.HashedType) -> Type with type elt = H.t

Equality and hash are necessary for table generation.

#
module Bool : sig

Tables for boolean values.

#
type t
#
val get : t -> UChar.t -> bool
#
val of_set : USet.t -> t
end
#
module Bits : sig

Tables for small (< 256, >=0) integers

#
type t
#
val of_map : int -> int UMap.t -> t
#
val get : t -> UChar.t -> int
end
#
module Bytes : sig

Tables for integers. If the integers do not span the whole range of 31-bit or 63-bit values, Bytes.t is more space efficient than int tbl.

#
type t
#
val of_map : int -> int UMap.t -> t
#
val get : t -> UChar.t -> int
end
#
module Char : sig

Tables for bytes.

#
type t
#
val of_map : char -> char UMap.t -> t
#
val get : t -> UChar.t -> char
end
end
#
module UnicodeString : sig

Signature for Unicode strings. UText, XString, UTF8, UTF16, UCS4 have matched signatures to UStorage and satisfy the semantics described below. If users want to supply their own Unicode strings, please design the module with the following signature and properties.

#
module type Type = sig
#
type t

The type of string.

#
val get : t -> int -> UChar.t

get t i : i-th character of the storage.

#
val init : int -> (int -> UChar.t) -> t

init len f creates a new storage. the returned storage has length len, its nth-element is f n. f is called with integers 0 ... len - 1, only once for each integer. The call is in the increasing order f 0, f 1, f 2, ...

#
val length : t -> int

The number of Unicode characters in the storage

#
type index

locations in storages.

#
val look : t -> index -> UChar.t

look t i : The character in the location i of t.

#
val nth : t -> int -> index

nth t n : the location of the n-th character in t.

next x i, prev x i : The operation is valid if i points the valid element, i.e. the returned value may point the location beyond valid elements by one. If i does not point a valid element, the results are unspecified.

#
val next : t -> index -> index
#
val prev : t -> index -> index
#
val out_of_range : t -> index -> bool
#
val iter : (UChar.t -> unit) -> t -> unit
#
val compare : t -> t -> int
#
val first : t -> index

The location of the first character in the storage.

#
val last : t -> index

The location of the last character in the storage.

#
val move : t -> index -> int -> index

move t i n : if n >= 0, then returns n-th character after i and otherwise returns -n-th character before i. If there is no such character, or i does not point a valid character, the result is unspecified.

#
val compare_index : t -> index -> index -> int

compare_index t i j returns a positive integer if i is the location placed after j in t, 0 if i and j point the same location, and a negative integer if i is the location placed before j in t.

#
module Buf : sig

Character buffers. Similar to Buffer.

#
type buf
#
val create : int -> buf

create n creates the buffer. n is used to determine the initial size of the buffer. The meaning of n differs from modules to modules.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
end
#
module UText : sig

An implementation of Unicode string.

An implementation of Unicode string. Internally, it uses integer array. The semantics matches the description of UStorage.

#
type mutability = [
| `Mutable
| `Immutable
]

Phantom type for distinguishing mutability

#
type 'a text
#
type utext = [
| `Immutable
] text
#
type ustring = [
| `Mutable
] text
#
type t = utext
#
val utext_of_ustring : ustring -> utext
#
val ustring_of_utext : utext -> ustring
#
val get : 'a text -> int -> UChar.t
#
val set : ustring -> int -> UChar.t -> unit

set s i u sets the i-th character in s to u.

#
type index
#
val look : 'a text -> index -> UChar.t
#
val nth : 'a text -> int -> index
#
val first : 'a text -> index
#
val last : 'a text -> index
#
val out_of_range : 'a text -> index -> bool
#
val compare_index : 'a text -> index -> index -> int
#
val next : 'a text -> index -> index
#
val prev : 'a text -> index -> index
#
val move : 'a text -> index -> int -> index
#
val length : 'a text -> int
#
val of_string : string -> utext

Conversion from Latin-1 strings.

#
val init : int -> (int -> UChar.t) -> utext
#
val init_ustring : int -> (int -> UChar.t) -> ustring
#
val make : int -> UChar.t -> ustring

The semantics of these functions are similar to those of their equivalents in String.

#
val copy : ustring -> ustring
#
val sub : 'a text -> int -> int -> 'a text
#
val fill : ustring -> int -> int -> UChar.t -> unit
#
val blit : 'a text -> int -> ustring -> int -> int -> unit
#
val append : 'a text -> 'b text -> 'a text
#
val iter : (UChar.t -> unit) -> 'a text -> unit
#
val compare : 'a text -> 'b text -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf

create n creates the buffer which initially can contain n Unicode characters.

#
val contents : buf -> t
#
val contents_string : buf -> ustring
#
val length : buf -> int
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> 'a text -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module XString : sig

eXtensible Unicode string. The semantics matches the description of UStorage. The details may change in the future.

#
type xstring
#
type t = xstring
#
val get : xstring -> int -> UChar.t
#
val set : xstring -> int -> UChar.t -> unit
#
val length : xstring -> int
#
val init : int -> (int -> UChar.t) -> xstring
#
type index
#
val look : xstring -> index -> UChar.t
#
val nth : xstring -> int -> index
#
val first : xstring -> index
#
val last : xstring -> index
#
val out_of_range : xstring -> index -> bool
#
val next : xstring -> index -> index
#
val prev : xstring -> index -> index
#
val move : xstring -> index -> int -> index
#
val compare_index : xstring -> index -> index -> int
#
val make : ?bufsize:int -> int -> UChar.t -> xstring
#
val clear : xstring -> unit
#
val reset : xstring -> unit
#
val copy : xstring -> xstring
#
val sub : xstring -> int -> int -> xstring
#
val add_char : xstring -> UChar.t -> unit
#
val add_text : xstring -> 'a UText.text -> unit
#
val add_xstring : xstring -> xstring -> unit
#
val shrink : xstring -> int -> unit
#
val append : xstring -> xstring -> xstring
#
val utext_of : xstring -> UText.t
#
val ustring_of : xstring -> UText.ustring
#
val iter : (UChar.t -> unit) -> xstring -> unit
#
val compare : t -> t -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf
#
val contents : buf -> t
#
val length : buf -> int
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module SubText : sig
#
module type Type = sig

Sub-texts, parts of original (ur-) texts. The signature and semantics match those of UStorage.

#
type t
#
val get : t -> int -> UChar.t
#
val init : int -> (int -> UChar.t) -> t
#
val length : t -> int
#
type index
#
val look : t -> index -> UChar.t
#
val nth : t -> int -> index
#
val first : t -> index
#
val last : t -> index
#
val next : t -> index -> index
#
val prev : t -> index -> index
#
val move : t -> index -> int -> index
#
val out_of_range : t -> index -> bool
#
val compare_index : t -> index -> index -> int
#
val iter : (UChar.t -> unit) -> t -> unit
#
val compare : t -> t -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf
#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
#
type ur_text

The type of original texts.

#
type ur_index

The type of indexes of original texts.

#
val refer : ur_text -> ur_index -> ur_index -> t

refer t i j returns the part of t from i until j. The character pointed to by j is not included in the result. If j is equal to i or located before i, the result is an empty string.

#
val excerpt : t -> ur_text

excerpt t copies the contents of t as a new ur_text.

#
val context : t -> ur_text * ur_index * ur_index

context t returns the tuple (s, i, j) such that t = refer s i j.

#
val ur_index_of : t -> index -> ur_index

Conversion from indexes of sub-texts to ur_texts.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type ur_text = Text.t and type ur_index = Text.index
end
#
module ULine : sig

Line IO

Line I/O, conversion of line separators.

#
type separator = [
| `CR
| `LF
| `CRLF
| `NEL
| `LS
| `PS
]

Line separators.

  • `CR specifies carriage return.
  • `LF specifies linefeed.
  • `CRLF specifies the sequence of carriage return and linefeed.
  • `NEL specifies next line (\u0085).
  • `LS specifies Unicode line separator (\u2028).
  • `PS specifies Unicode paragraph separator (\u2029).
#
class input : separator -> UChar.t #OOChannel.obj_input_channel -> [UChar.t] OOChannel.obj_input_channel

new input separator input_obj creates the new input channel object OOChannel.obj_input_channel which reads from input_obj and converts line separators (all of CR, LF, CRLF, NEL, LS, PS) to separator.

#
class output : separator -> UChar.t #OOChannel.obj_output_channel -> [UChar.t] OOChannel.obj_output_channel

new output separator output_obj creates the new output channel object OOChannel.obj_output_channel which receives Unicode characters and converts line separators (all of CR, LF, CRLF, NEL, LS, PS) to separator.

#
module type Type = sig
#
type text
#
class input_line : UChar.t #OOChannel.obj_input_channel -> [text] OOChannel.obj_input_channel

new input_line input_obj creates the new input channel object OOChannel.obj_input_channel which reads Unicode characters from input_obj and output lines. All of CR, LF, CRLF, NEL, LS, PS, as well as FF (formfeed) are recognised as a line separator.

#
class output_line : ?sp:separator option -> UChar.t #OOChannel.obj_output_channel -> [text] OOChannel.obj_output_channel

new output_line ~sp output_obj create the new output channel object OOChannel.obj_output_channel which output each line to output_obj using sp as a line separator. If sp is omitted, linefeed (LF) is used.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module Locale : sig

Camomile has a locale system similar to Java's. A locale is a string of the form "<LANG>_<COUNTRY>_<MODIFIER>..." where <LANG> is a 2-letter ISO 639 language code and <COUNTRY> is a 2-letter ISO 3166 country code. Some fields may be absent.

#
type t = string

Type of locales.

#
val read : string -> string -> (Pervasives.in_channel -> 'a) -> string -> 'a

read root suffix reader locale reads locale information using reader. Locale data is supposed to reside in the root directory under the name locale.suffix. reader takes an in_channel as an argument and reads data from it. If the data is not found, then reader should raise Not_found. If the file is not found or reader raises Not_found, then more generic locales are tried. For example, if fr_CA.suffix is not found, then read tries fr.suffix. If fr.suffix is also not found, then the file root.suffix is tried. If the data is still not found, then Not_found is raised.

#
val contain : string -> string -> bool

contain loc1 loc2 : If loc1 is contained in loc2 then true otherwise false. For example, "fr" is contained in "fr_CA" while "en_CA" does not contain "fr"

end
#
module UTF8 : sig

UTF-8 encoded Unicode strings. The type is normal string.

#
type t = string

UTF-8 encoded Unicode strings. The type is normal string.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UTF-8, otherwise raises Malformed_code. Other functions assume strings are valid UTF-8, so it is prudent to test their validity for strings from untrusted origins.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s. The call requires O(n)-time.

#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialized by f i

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of bytes from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

The position of the head of the first Unicode character.

#
val last : t -> index

The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i is a position inside of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns a value < 0 if i1 is the position located before i2, 0 if i1 and i2 points the same location, a value > 0 if i1 is the position located after i2.

#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i. If i is inside of s, the function always succeeds. If i is inside of s and there is no Unicode character after i, the position outside s is returned. If i is not inside of s, the behaviour is unspecified.

#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i. If i is inside of s, the function always succeeds. If i is inside of s and there is no Unicode character before i, the position outside s is returned. If i is not inside of s, the behaviour is unspecified.

#
val move : t -> index -> int -> index

move s i n returns n-th Unicode character after i if n >= 0, n-th Unicode character before i if n < 0. If there is no such character, the result is unspecified.

#
val iter : (UChar.t -> unit) -> t -> unit

iter f s applies f to all Unicode characters in s. The order of application is the same as the order of the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison by the lexicographic order. compare s1 s2 returns a positive integer if s1 > s2, 0 if s1 = s2, a negative integer if s1 < s2.

#
module Buf : sig

Buffer module for UTF-8 strings

#
type buf

Buffers for UTF-8 strings.

#
val create : int -> buf

create n creates the buffer with the initial size n-bytes.

#
val contents : buf -> t

contents buf returns the contents of the buffer.

#
val clear : buf -> unit

Empty the buffer, but retains the internal storage which was holding the contents

#
val reset : buf -> unit

Empty the buffer and de-allocate the internal storage.

#
val add_char : buf -> UChar.t -> unit

Add one Unicode character to the buffer.

#
val add_string : buf -> t -> unit

Add the UTF-8 string to the buffer.

#
val add_buffer : buf -> buf -> unit

add_buffer b1 b2 adds the contents of b2 to b1. The contents of b2 is not changed.

end
with type buf = Buffer.t
end
#
module UTF16 : sig
#
type t = (int, Bigarray.int16_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t

UTF-16 encoded string. The type is the bigarray of 16-bit integers. The characters must be 21-bit code points, and not surrogate points, 0xfffe, 0xffff. Bigarray.cma or Bigarray.cmxa must be linked when this module is used.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UTF-16, otherwise raises Malformed_code. Other functions assume strings are valid UTF-16, so it is prudent to test their validity for strings from untrusted origins.

All functions below assume strings are valid UTF-16. If not, the result is unspecified.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s. The call requires O(n)-time.

#
exception Out_of_range
#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialized by f i. If the character is not representable, raises Out_of_range.

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of 16-bit unit from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

first s : The position of the head of the first Unicode character.

#
val last : t -> index

last s : The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i is inside of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns

  • If i1 is the position located before i2, a value < 0,
  • If i1 and i2 points the same location, 0,
  • If i1 is the position located after i2, a value > 0.
#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i.

  • If i is a valid position, the function always succeeds.
  • If i is a valid position and there is no Unicode character after i, the position outside s is returned.
  • If i is not a valid position, the behaviour is undefined.
#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i.

  • If i is a valid position, the function always succeeds.
  • If i is a valid position and there is no Unicode character before i, the position outside s is returned.
  • If i is not a valid position, the behaviour is undefined.
#
val move : t -> index -> int -> index
#
val iter : (UChar.t -> unit) -> t -> unit

iter f s Apply f to all Unicode characters in s. The order of application is same to the order in the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison

#
module Buf : sig

Buffer module for UTF-16

#
type buf
#
val create : int -> buf

create n : creates the buffer with the initial size n.

The rest of functions is similar to the ones of Buffer in stdlib.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit

if the character is not representable, raise Out_of_range

#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module UCS4 : sig

UCS4 encoded string. The type is the bigarray of 32-bit integers. Bigarray.cma or Bigarray.cmxa must be linked when this module is used.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UCS4, otherwise raises Malformed_code. Other functions assume strings are valid UCS4, so it is prudent to test their validity for strings from untrusted origins.

All functions below assume strings are valid UCS4. If not, the result is unspecified.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s.

#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialised by f i

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of characters from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

first s : The position of the head of the first Unicode character.

#
val last : t -> index

last s : The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i points the valid position of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns: a value < 0 if i1 is the position located before i2; 0 if i1 and i2 point to the same location; a value > 0 if i1 is the position located after i2.

#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i. If i is a valid position, the function always succeeds. If i is a valid position and there is no Unicode character after i, the position outside s is returned. If i is not a valid position, the behaviour is undefined.

#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i. If i is a valid position, the function always succeeds. If i is a valid position and there is no Unicode character before i, the position outside s is returned. If i is not a valid position, the behaviour is undefined.

#
val move : t -> index -> int -> index

move s i n : If n >= 0, returns n-th Unicode character after i. If n < 0, returns -n-th Unicode character before i. If there is no such character, the result is unspecified.

#
val iter : (UChar.t -> unit) -> t -> unit

iter f s : Apply f to all Unicode characters in s. The order of application is same to the order in the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison

#
module Buf : sig

Buffer module for UCS4

#
type buf

create n creates the buffer with the initial size n.

#
val create : int -> buf

The rest of the functions are similar to the ones of Buffer in stdlib.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module UPervasives : sig

Functions for toplevel

#
type uchar = UChar.t

Aliases for UChar.uint_code, UChar.chr_of_uint

#
val int_of_uchar : uchar -> int
#
val uchar_of_int : int -> uchar
#
val escaped_uchar : uchar -> string
#
val escaped_utf8 : string -> string
#
val printer_utf8 : Format.formatter -> string -> unit
#
val printer_uchar : Format.formatter -> uchar -> unit
end
#
module URe : sig

Regular expression engine.

#
type regexp = [
| `Alt of regexp * regexp
| `Seq of regexp * regexp
| `Rep of regexp
| `Repn of regexp * int * int option
| `After of regexp
| `Before of regexp
| `Epsilon
| `Group of regexp
| `OneChar
| `String of UChar.t list
| `Set of USet.t
| `BoS
| `EoS
]

Abstract syntax trees of regular expressions.

#
type match_semantics = [
| `First
| `Shortest
| `Longest
]

Match semantics.

#
val no_group : regexp -> regexp

Remove `Group from the regular expressions.

#
module type Type = sig
#
type text
#
type index
#
type compiled_regexp
#
module SubText : SubText.Type with type ur_text = text and type ur_index = index
#
val compile : regexp -> compiled_regexp

Compile regular expressions.

#
val regexp_match : ?sem:match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

regexp_match ?sem r t i tries matching r against substrings of t beginning from i. If the match succeeds, Some g is returned where g is the array containing the matched string of the n-th group in the n-th element. The matched string of the whole r is stored in the 0-th element. If matching fails, None is returned.

#
val string_match : compiled_regexp -> text -> index -> bool

string_match r t i tests whether r can match a substring of t beginning from i.

#
val search_forward : ?sem:match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

search_forward ?sem r t i searches a substring of t matching r from i. The returned value is similar to URe.Type.regexp_match.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module CharEncoding : sig
#
module type Interface = sig

Module for character encodings.

#
exception Malformed_code

Failure of decoding

#
exception Out_of_range

Failure of encoding

#
type t

Type for encodings.

#
val automatic : string -> t list -> t -> t

automatic name [enc_1; enc_2; ... enc_n] enc creates the new encoding name doing automatic encoding detection among enc_1, enc_2, ..., enc_n by the given order. enc is used for encoding.

#
val new_enc : string -> t -> unit

new_enc name enc registers the new encoding enc under the name name

#
val alias : string -> string -> unit

alias alias name : Define alias as an alias of the encoding with the name name.

#
val of_name : string -> t

Returns the encoding of the given name. Fails if the encoding is unknown. Encoding names are the same to codeset names in charmap files for the encodings defined by charmap. See charmaps directory in the source directory for the available encodings. In addition to the encodings via the charmap files, camomile supports ISO-2022-CN, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-KR, jauto (Auto detection of Japanese encodings), UTF-8, UTF-16, UTF-16BE, UTF-16LE. UTF-32, UTF-32BE, UTF-32LE, UCS-4(Big endian order). The encoding also can be referred by "IANA/<IANA name>", if the encoding is supported.

#
val name_of : t -> string

Returns the name of the encoding.

Shortcuts

#
val ascii : t
#
val latin1 : t
#
val utf8 : t
#
val utf16 : t
#
val utf16be : t
#
val utf16le : t
#
val utf32 : t
#
val utf32be : t
#
val utf32le : t
#
val ucs4 : t
#
val recode_string : in_enc:t -> out_enc:t -> string -> string

recode_string ~in_enc ~out_enc s converts the string s from in_enc to out_enc.

#
class uchar_input_channel_of : t -> OOChannel.char_input_channel -> [UChar.t] OOChannel.obj_input_channel

new uchar_input_channel_of enc c_in creates the new input channel which converts characters to Unicode using encoding enc.

#
class uchar_output_channel_of : t -> OOChannel.char_output_channel -> [UChar.t] OOChannel.obj_output_channel

new uchar_output_channel_of enc c_out creates the new output channel which converts Unicode to its byte representation using encoding enc.

#
class convert_uchar_input : t -> UChar.t OOChannel.obj_input_channel -> OOChannel.char_input_channel

new convert_uchar_input enc c_in creates the new channel which convert Unicode input to its byte representation using encoding enc.

#
class convert_uchar_output : t -> UChar.t OOChannel.obj_output_channel -> OOChannel.char_output_channel

new convert_uchar_output enc c_out creates the new channel which converts character output to Unicode using encoding enc.

#
class convert_input : in_enc:t -> out_enc:t -> OOChannel.char_input_channel -> OOChannel.char_input_channel

new convert_input in_enc out_enc c_in creates the new input channel using encoding out_enc from the input channel using encoding in_enc

#
class convert_output : in_enc:t -> out_enc:t -> OOChannel.char_output_channel -> OOChannel.char_output_channel

new convert_output in_enc out_enc c_out creates the new output channel using encoding in_enc from the output channel using encoding out_enc

#
class out_channel : t -> Pervasives.out_channel -> [UChar.t] OOChannel.obj_output_channel

new out_channel enc outchan creates the output channel object OOChannel.obj_output_channel which receives Unicode characters and outputs them to outchan using the encoding enc.

#
class in_channel : t -> Pervasives.in_channel -> [UChar.t] OOChannel.obj_input_channel

new in_channel enc inchan creates the input channel object OOChannel.obj_input_channel which reads bytes from inchan and converts them to Unicode characters.

#
val ustream_of : t -> char Stream.t -> UChar.t Stream.t

ustream_of enc chars converts the byte stream chars to the Unicode character stream by the encoding enc.

#
val char_stream_of : t -> UChar.t Stream.t -> char Stream.t

char_stream_of enc uchars converts the Unicode character stream uchars to the byte stream by the encoding enc

#
module type Type = sig
#
type text
#
val decode : t -> string -> text

decode enc s converts the string s encoded by the encoding enc to the Unicode text.

#
val encode : t -> text -> string

encode enc t converts the Unicode text t to the string by the encoding enc.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module Configure : functor (Config : ConfigInt.Type) -> Interface
end
#
module UCharInfo : sig

Unicode character information

#
module type Type = sig

Character Information

#
type general_category_type = [
| `Lu
| `Ll
| `Lt
| `Mn
| `Mc
| `Me
| `Nd
| `Nl
| `No
| `Zs
| `Zl
| `Zp
| `Cc
| `Cf
| `Cs
| `Co
| `Cn
| `Lm
| `Lo
| `Pc
| `Pd
| `Ps
| `Pe
| `Pi
| `Pf
| `Po
| `Sm
| `Sc
| `Sk
| `So
]

Type of Unicode general character categories. Each variant specifies

  • `Lu : Letter, Uppercase
  • `Ll : Letter, Lowercase
  • `Lt : Letter, Titlecase
  • `Mn : Mark, Non-Spacing
  • `Mc : Mark, Spacing Combining
  • `Me : Mark, Enclosing
  • `Nd : Number, Decimal Digit
  • `Nl : Number, Letter
  • `No : Number, Other
  • `Zs : Separator, Space
  • `Zl : Separator, Line
  • `Zp : Separator, Paragraph
  • `Cc : Other, Control
  • `Cf : Other, Format
  • `Cs : Other, Surrogate
  • `Co : Other, Private Use
  • `Cn : Other, Not Assigned
  • `Lm : Letter, Modifier
  • `Lo : Letter, Other
  • `Pc : Punctuation, Connector
  • `Pd : Punctuation, Dash
  • `Ps : Punctuation, Open
  • `Pe : Punctuation, Close
  • `Pi : Punctuation, Initial
  • `Pf : Punctuation, Final
  • `Po : Punctuation, Other
  • `Sm : Symbol, Math
  • `Sc : Symbol, Currency
  • `Sk : Symbol, Modifier
  • `So : Symbol, Other
#
val general_category : UChar.t -> general_category_type
#
val load_general_category_map : unit -> general_category_type UMap.t
#
type character_property_type = [
| `Math
| `Alphabetic
| `Lowercase
| `Uppercase
| `ID_Start
| `ID_Continue
| `XID_Start
| `XID_Continue
| `Default_Ignorable_Code_Point
| `Grapheme_Extend
| `Grapheme_Base
| `Bidi_Control
| `White_Space
| `Hyphen
| `Quotation_Mark
| `Terminal_Punctuation
| `Other_Math
| `Hex_Digit
| `Ascii_Hex_Digit
| `Other_Alphabetic
| `Ideographic
| `Diacritic
| `Extender
| `Other_Lowercase
| `Other_Uppercase
| `Noncharacter_Code_Point
| `Other_Grapheme_Extend
| `Grapheme_Link
| `IDS_Binary_Operator
| `IDS_Trinary_Operator
| `Radical
| `Unified_Ideograph
| `Other_default_Ignorable_Code_Point
| `Deprecated
| `Soft_Dotted
| `Logical_Order_Exception
]

Type of character properties

#
val load_property_tbl : character_property_type -> UCharTbl.Bool.t

Load the table for the given character type.

#
val load_property_tbl_by_name : string -> UCharTbl.Bool.t

Load the table for the given name of the character type. The name can be obtained by removing ` from its name of the polymorphic variant tag.

#
val load_property_set : character_property_type -> USet.t

Load the set of characters of the given character type.

#
val load_property_set_by_name : string -> USet.t

Load the set of characters of the given name of the character type. The name can be obtained by removing ` from its name of the polymorphic variant tag.

#
type script_type = [
| `Common
| `Inherited
| `Latin
| `Greek
| `Cyrillic
| `Armenian
| `Hebrew
| `Arabic
| `Syriac
| `Thaana
| `Devanagari
| `Bengali
| `Gurmukhi
| `Gujarati
| `Oriya
| `Tamil
| `Telugu
| `Kannada
| `Malayalam
| `Sinhala
| `Thai
| `Lao
| `Tibetan
| `Myanmar
| `Georgian
| `Hangul
| `Ethiopic
| `Cherokee
| `Canadian_Aboriginal
| `Ogham
| `Runic
| `Khmer
| `Mongolian
| `Hiragana
| `Katakana
| `Bopomofo
| `Han
| `Yi
| `Old_Italic
| `Gothic
| `Deseret
| `Tagalog
| `Hanunoo
| `Buhid
| `Tagbanwa
]

Type for script type

#
val script : UChar.t -> script_type
#
val load_script_map : unit -> script_type UMap.t
#
type version_type = [
| `Nc
| `v1_0
| `v1_1
| `v2_0
| `v2_1
| `v3_0
| `v3_1
| `v3_2
]

age

#
val age : UChar.t -> version_type

age c returns the Unicode version in which c was introduced.

older v1 v2 is true if v1 is older than (or the same version as) v2. Everything is older than `Nc

#
val older : version_type -> version_type -> bool

casing

#
val load_to_lower1_tbl : unit -> UChar.t UCharTbl.t
#
val load_to_upper1_tbl : unit -> UChar.t UCharTbl.t
#
val load_to_title1_tbl : unit -> UChar.t UCharTbl.t
#
type casemap_condition = [
| `Locale of string
| `FinalSigma
| `AfterSoftDotted
| `MoreAbove
| `BeforeDot
]
#
type special_casing_property = {
# lower
: UChar.t list;
# title
: UChar.t list;
# upper
: UChar.t list;
# condition
: casemap_condition list;
}
#
val load_conditional_casing_tbl : unit -> special_casing_property list UCharTbl.t
#
val load_casefolding_tbl : unit -> UChar.t list UCharTbl.t
#
val combined_class : UChar.t -> int

Combined class A combined class is an integer of 0 -- 255, showing how this character interacts to other combined characters.

Decomposition

#
type decomposition_type = [
| `Canon
| `Font
| `NoBreak
| `Initial
| `Medial
| `Final
| `Isolated
| `Circle
| `Super
| `Sub
| `Vertical
| `Wide
| `Narrow
| `Small
| `Square
| `Fraction
| `Compat
]

Types of decomposition.

#
type decomposition_info = [
| `Canonform
| `HangulSyllable
| `Composite of decomposition_type * UChar.t list
]
#
val load_decomposition_tbl : unit -> decomposition_info UCharTbl.t

Canonical Composition

#
val load_composition_tbl : unit -> (UChar.t * UChar.t) list UCharTbl.t

The return value [(u_1, u'_1); ... (u_n, u'_n)] means for the given character u, u u_i forms the canonical composition u'_i. If u is a Hangul jamo, composition returns the empty list [].

#
val load_composition_exclusion_tbl : unit -> UCharTbl.Bool.t

Whether the given composed character is used in NFC or NFKC

end
#
module Make : functor (Config : ConfigInt.Type) -> Type
end
#
module UNF : sig

Unicode normal form (NFD, NFKD, NFC, NFKC) as described in UTR #15

#
module type Type = sig
#
type text
#
class nfd : UChar.t #OOChannel.obj_output_channel -> [UChar.t] OOChannel.obj_output_channel
#
class nfc : UChar.t #OOChannel.obj_output_channel -> [UChar.t] OOChannel.obj_output_channel
#
class nfkd : UChar.t #OOChannel.obj_output_channel -> [UChar.t] OOChannel.obj_output_channel
#
class nfkc : UChar.t #OOChannel.obj_output_channel -> [UChar.t] OOChannel.obj_output_channel

Conversion to NFD, NFKD, NFC, NFKC forms.

#
val nfd : text -> text
#
val nfkd : text -> text
#
val nfc : text -> text
#
val nfkc : text -> text
#
module NFCBuf : sig
#
type buf
#
val create : int -> buf
#
val contents : buf -> text
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> text -> unit
#
val add_buffer : buf -> buf -> unit
end
#
val nfc_append : text -> text -> text

put_nfd b t, put_nfkd b t, put_nfc b t, put_nfkc b t clear the contents of b and put the NFD, NFKD, NFC, NFKC forms of t into b respectively.

#
val put_nfd : XString.t -> text -> unit
#
val put_nfkd : XString.t -> text -> unit
#
val put_nfc : XString.t -> text -> unit
#
val put_nfkc : XString.t -> text -> unit
#
type index
#
val nfd_inc : text -> index -> ([
| `Inc of UChar.t list * index * 'a lazy_t
] as 'a)
#
val canon_compare : text -> text -> int
#
val nfd_decompose : UChar.t -> UChar.t list
#
val nfkd_decompose : UChar.t -> UChar.t list
end
#
module Make : functor (Config : ConfigInt.Type) -> functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module UCol : sig

Unicode collation algorithm

String comparison by collation as described in UTR #10

#
type variable_option = [
| `Blanked
| `Non_ignorable
| `Shifted
| `Shift_Trimmed
]

How variables are handled

#
type precision = [
| `Primary
| `Secondary
| `Tertiary
| `Quaternary
]

Strength of comparison. For European languages, each strength roughly means as `Primary : Ignore accents and case `Secondary : Ignore case but accents are counted in. `Tertiary : Accents and case are counted in. For the case of `Shifted, `Shift_Trimmed, there is the fourth strength. `Quaternary : Variables such as - (hyphen) are counted in.

#
module type Type = sig
#
type text
#
type index
#
val compare : ?locale:string -> ?prec:precision -> ?variable:variable_option -> text -> text -> int

For locale, see Locale. If locale is omitted, the standard UCA order is used. If prec is omitted, the maximum possible strength is used. If variable is omitted, the default of the locale (usually `Shifted) is used. The meaning of the returned value is similar to Pervasives.compare

#
val sort_key : ?locale:string -> ?prec:precision -> ?variable:variable_option -> text -> string

Binary comparison of sort_key gives the same result as compare. i.e. compare t1 t2 = Pervasives.compare (sort_key t1) (sort_key t2) If the same texts are repeatedly compared, pre-computation of sort_key gives better performance.

#
val compare_with_key : ?locale:string -> ?prec:precision -> ?variable:variable_option -> string -> text -> int

Comparison with the sort key.

#
val search_with_key : ?locale:string -> ?prec:precision -> ?variable:variable_option -> string -> text -> index -> index * index
#
val search : ?locale:string -> ?prec:precision -> ?variable:variable_option -> text -> text -> index -> index * index
end
#
module Make : functor (Config : ConfigInt.Type) -> functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module CaseMap : sig

Case mappings as defined in Unicode Technical Report #21

For locale, see Locale. If locale is omitted, default mapping is used.

#
module type Type = sig
#
type text
#
val lowercase : ?locale:string -> text -> text
#
val uppercase : ?locale:string -> text -> text
#
val titlecase : ?locale:string -> text -> text

Capitalize the beginning of words

#
val casefolding : text -> text

Case folding

#
val compare_caseless : text -> text -> int

Caseless comparison

end
#
module Make : functor (Config : ConfigInt.Type) -> functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module UReStr : sig

Module for a Str-like regular expression syntax. The difference can be summarised as follows.

  • Non-ASCII characters can be used if encoded by UTF-8, or using the escape syntax \u<code number as hex digits>.
  • Each Unicode character is treated as a single character.
  • Character properties like Lu (UCharInfo.general_category_type), White_Space (UCharInfo.character_property_type), Ogham (UCharInfo.script_type) can be used in character sets. e.g. [{Lu & ID_Start}][{ID_Continue}]* (capitalised identifier), \([{Han}]+\|[{Katakana}]+\)[{Hiragana}]* (Japanese word component). Boolean notations such as | (or), & (and), - (set subtraction) can be used in {...} notations. Any is used to denote the set of all characters in {...} notations.
#
module type Interface = sig
#
type regexp = URe.regexp

These functions are similar to Str.

#
val regexp : string -> regexp
#
val quote : string -> string
#
val regexp_string : string -> regexp
#
module type Type = sig
#
type text
#
type index
#
type compiled_regexp
#
module SubText : SubText.Type with type ur_text = text and type ur_index = index
#
val compile : regexp -> compiled_regexp

Compile regular expressions.

#
val regexp_match : ?sem:URe.match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

regexp_match ?sem r t i tries matching r against substrings of t beginning from i. If the match succeeds, Some g is returned where g is the array containing the matched string of the n-th group in the n-th element. The matched string of the whole r is stored in the 0-th element. If matching fails, None is returned.

#
val string_match : compiled_regexp -> text -> index -> bool

string_match r t i tests whether r can match a substring of t beginning from i.

#
val search_forward : ?sem:URe.match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

search_forward ?sem r t i searches a substring of t matching r from i. The returned value is similar to URe.Type.regexp_match.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module Configure : functor (Config : ConfigInt.Type) -> Interface
end
#
module StringPrep : sig
#
module type Type = sig
#
type text
#
exception Prohibited of UChar.t
#
exception Bad_bidi
#
type profile = [
| `Nameprep
| `Nodeprep
| `Resourceprep
| `Saslprep
| `Trace
| `Iscsi
| `Mib
]
#
val stringprep : profile -> text -> text
end
#
module Make : functor (Config : ConfigInt.Type) -> functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module type Type = sig
#
module OOChannel : sig

Object Oriented Channel

#
class type ['a] obj_input_channel =
#
method close_in : unit -> unit
#
method get : unit -> 'a

Generic input channel. Has the same interface as the Polymorphic input channel of http://www.ocaml-programming.de/rec/IO-Classes.html All channels of Camomile having this interface must conform to the behaviour defined in the recommendation above.

#
class type ['a] obj_output_channel =
#
method close_out : unit -> unit

If close_out cannot output all buffered objects, close_out raises Failure

If flush cannot output all buffered objects, flush raises Failure

#
method flush : unit -> unit
#
method put : 'a -> unit

Generic output channel. Has the same interface as the Polymorphic output channel of http://www.ocaml-programming.de/rec/IO-Classes.html All channels of Camomile having this interface must conform to the behaviour defined in the recommendation above.

#
class ['a] channel_of_stream : 'a Stream.t -> ['a] obj_input_channel

Convert stream to obj_input_channel

#
val stream_of_channel : 'a #obj_input_channel -> 'a Stream.t

Convert obj_input_channel to stream

#
class type char_input_channel =
#
method input : string -> int -> int -> int
#
method close_in : unit -> unit

Character (byte) input channel. Has the same interface as the octet input channel of http://www.ocaml-programming.de/rec/IO-Classes.html All channels of Camomile having this interface must conform to the behaviour defined in the recommendation above. In addition, all channels are assumed to be blocking. If you supply a non-blocking channel to the Camomile API, the outcome is undefined.

#
class type char_output_channel =
#
method output : string -> int -> int -> int
#
method flush : unit -> unit
#
method close_out : unit -> unit

Character (byte) output channel. Has the same interface as the octet output channel of http://www.ocaml-programming.de/rec/IO-Classes.html All channels of Camomile having this interface must conform to the behaviour defined in the recommendation above. In addition, all channels are assumed to be blocking. If you supply a non-blocking channel to the Camomile API, the outcome is undefined.

#
class char_input_channel_of : char #obj_input_channel -> char_input_channel

Convert a polymorphic input channel to a character input channel

#
class char_obj_input_channel_of : char_input_channel -> [char] obj_input_channel

Convert a character input channel to a polymorphic input channel

#
class char_output_channel_of : char #obj_output_channel -> char_output_channel

Convert a polymorphic output channel to a character output channel

#
class char_obj_output_channel_of : char_output_channel -> [char] obj_output_channel

Convert a character output channel to a polymorphic output channel

#
class of_in_channel : Pervasives.in_channel -> char_input_channel

Convert an OCaml input channel to an OO-based character input channel

#
class of_out_channel : Pervasives.out_channel -> char_output_channel

Convert an OCaml output channel to an OO-based character output channel

end
#
module UChar : sig

Unicode (ISO-UCS) characters.

This module implements Unicode (actually ISO-UCS) characters. All 31-bit code points are allowed.

#
type t

Unicode characters. All 31bit code points are allowed.

#
exception Out_of_range
#
val char_of : t -> char

char_of u returns the Latin-1 representation of u. If u can not be represented by Latin-1, raises Out_of_range

#
val of_char : char -> t

of_char c returns the Unicode character of the Latin-1 character c

#
val code : t -> int

code u returns the Unicode code number of u. If the value can not be represented by a positive integer, raise Out_of_range

#
val chr : int -> t

chr n returns the Unicode character with the code number n. If n >= 2^32 or n < 0, raises invalid_arg

#
external uint_code : t -> int = "%identity"

uint_code u returns the Unicode code number of u. The returned int is unsigned, that is, on 32-bits platforms, the sign bit is used for storing the 31-th bit of the code number.

#
val chr_of_uint : int -> t

chr_of_uint n returns the Unicode character of the code number n. n is interpreted as unsigned, that is, on 32-bits platforms, the sign bit is treated as the 31-th bit of the code number. If n exceeds the range of 31-bit values, raises invalid_arg.

#
val eq : t -> t -> bool

Equality by code point comparison

#
val compare : t -> t -> int

compare u1 u2 returns, a value > 0 if u1 has a larger Unicode code number than u2, 0 if u1 and u2 are the same Unicode character, a value < 0 if u1 has a smaller Unicode code number than u2.

#
type uchar = t

Aliases of type t

#
val int_of : uchar -> int

Alias of uint_code

#
val of_int : int -> uchar

Alias of chr_of_uint

end
#
module USet : sig

Sets of Unicode characters, implemented as sets of intervals. The signature is mostly same to Set.S in stdlib

#
type t
#
val empty : t
#
val is_empty : t -> bool
#
val mem : UChar.t -> t -> bool
#
val add : UChar.t -> t -> t
#
val add_range : UChar.t -> UChar.t -> t -> t

add_range u1 u2 s adds the characters in the range u1 - u2 to s. The range is determined by the code point order.

#
val singleton : UChar.t -> t
#
val remove : UChar.t -> t -> t
#
val remove_range : UChar.t -> UChar.t -> t -> t

remove_range u1 u2 s removes the characters in the range u1 - u2 from s. The range is determined by the code point order.

#
val union : t -> t -> t
#
val inter : t -> t -> t
#
val diff : t -> t -> t
#
val compl : t -> t

compl s returns the complement of s.

#
val compare : t -> t -> int
#
val equal : t -> t -> bool
#
val subset : t -> t -> bool
#
val from : UChar.t -> t -> t

from u s returns the set of elements of s whose code points are equal or greater than u.

#
val after : UChar.t -> t -> t

after u s returns the set of elements of s whose code points are greater than u.

#
val until : UChar.t -> t -> t

until u s returns the set of elements of s whose code points are equal or smaller than u.

#
val before : UChar.t -> t -> t

before u s returns the set of elements of s whose code points are smaller than u.

#
val iter : (UChar.t -> unit) -> t -> unit
#
val iter_range : (UChar.t -> UChar.t -> unit) -> t -> unit

iter_range proc s feeds the intervals contained in s to proc in increasing order. The intervals given to proc are always separated by the character not in s.

#
val fold : (UChar.t -> 'a -> 'a) -> t -> 'a -> 'a
#
val fold_range : (UChar.t -> UChar.t -> 'a -> 'a) -> t -> 'a -> 'a

fold_range f s x is equivalent to f u_i u_(i+1) (... (f u_3 u_4 (f u_1 u_2 x))) if s is consisted of the intervals u1-u2, u3-u4, ..., u_i-u_(i + 1) in increasing order. The intervals given to proc are always separated by the character not in s.

#
val for_all : (UChar.t -> bool) -> t -> bool
#
val exists : (UChar.t -> bool) -> t -> bool
#
val filter : (UChar.t -> bool) -> t -> t
#
val partition : (UChar.t -> bool) -> t -> t * t
#
val cardinal : t -> int
#
val elements : t -> UChar.t list
#
val ranges : t -> (UChar.t * UChar.t) list

The list of the intervals contained in the set. The returned intervals are always separated by the character not in s.

#
val min_elt : t -> UChar.t
#
val max_elt : t -> UChar.t
#
val choose : t -> UChar.t

Returns a element roughly in the middle of the set. It is not guaranteed to return the same element for the sets with the same elements

#
val uset_of_iset : ISet.t -> t
#
val iset_of_uset : t -> ISet.t
end
#
module UMap : sig
#
type 'a t

Maps over Unicode characters.

#
val empty : 'a t
#
val is_empty : 'a t -> bool
#
val add : ?eq:('a -> 'a -> bool) -> UChar.t -> 'a -> 'a t -> 'a t

add ?eq u v m returns the new map which is same to m except it maps u to some value v' which satisfies eq v v'. If eq is not supplied, structural equality is used.

#
val add_range : ?eq:('a -> 'a -> bool) -> UChar.t -> UChar.t -> 'a -> 'a t -> 'a t

add_range ?eq u1 u2 v m returns the new map which is same to m except it maps characters in the range u1-u2 to some value v' which satisfies eq v v'. If eq is not supplied, structural equality is used.

#
val find : UChar.t -> 'a t -> 'a
#
val remove : UChar.t -> 'a t -> 'a t
#
val remove_range : UChar.t -> UChar.t -> 'a t -> 'a t

remove_range u1 u2 m removes u1-u2 from the domain of m

#
val from : UChar.t -> 'a t -> 'a t

from u m restricts the domain of m to the characters whose code points are equal or greater than u.

#
val after : UChar.t -> 'a t -> 'a t

after u m restricts the domain of m to the characters whose code points are greater than u.

#
val until : UChar.t -> 'a t -> 'a t

until u m restricts the domain of m to the characters whose code points are equal or smaller than u.

#
val before : UChar.t -> 'a t -> 'a t

before u m restricts the domain of m to the characters whose code points are smaller than u.

#
val mem : UChar.t -> 'a t -> bool
#
val iter : (UChar.t -> 'a -> unit) -> 'a t -> unit
#
val iter_range : (UChar.t -> UChar.t -> 'a -> unit) -> 'a t -> unit

iter_range proc m : For each contiguous region u1-u2 that is mapped to a constant v, proc u1 u2 v is called. The order of call is determined by increasing order on u1.

map ?eq f m and mapi ?eq f m : Similar to map and mapi in stdlib Map, but if the map m' is returned, it is only guaranteed that eq (find u m') (f (find u m )) is true for map and eq (find u m') (f u (find u m )) is true for mapi. If eq is not specified, structural equality is used.

#
val map : ?eq:('b -> 'b -> bool) -> ('a -> 'b) -> 'a t -> 'b t
#
val mapi : ?eq:('b -> 'b -> bool) -> (UChar.t -> 'a -> 'b) -> 'a t -> 'b t
#
val fold : (UChar.t -> 'b -> 'a -> 'a) -> 'b t -> 'a -> 'a
#
val fold_range : (UChar.t -> UChar.t -> 'b -> 'a -> 'a) -> 'b t -> 'a -> 'a

fold_range f m x is equivalent to f u_(2n) u_(2n+1) v_n (... (f u_1 u_2 v_1 x)) where all characters in the range u_(2k)-u_(2k+1) are mapped to v_k and u_1 < u_3 < ... in code point order. For each range u_(2k)-u_(2k+1) is separated by a character which is not mapped to v_k.

#
val set_to_map : USet.t -> 'a -> 'a t

Constant map.

#
val domain : 'a t -> USet.t

Domain.

#
val map_to_set : ('a -> bool) -> 'a t -> USet.t

map_to_set p m returns the set of characters which are mapped to values satisfying the predicate p by m.

#
val umap_of_imap : 'a IMap.t -> 'a t
#
val imap_of_umap : 'a t -> 'a IMap.t
end
#
module UCharTbl : sig

Fast lookup tables for Unicode. Accessible by constant time.

#
type 'a tbl

Fast lookup tables. Accessible by constant time.

#
type 'a t = 'a tbl
#
val get : 'a tbl -> UChar.t -> 'a
#
module type Type = sig
#
type elt
#
type t = elt tbl
#
val get : elt tbl -> UChar.t -> elt
#
val of_map : elt -> elt UMap.t -> t

of_map def m creates the table which has the same value to m. The table returns def for the characters for which m is undefined.

end
#
module Make : functor (H : Hashtbl.HashedType) -> Type with type elt = H.t

Equality and hash are necessary for table generation.

#
module Bool : sig

Tables for boolean values.

#
type t
#
val get : t -> UChar.t -> bool
#
val of_set : USet.t -> t
end
#
module Bits : sig

Tables for small (< 256, >=0) integers

#
type t
#
val of_map : int -> int UMap.t -> t
#
val get : t -> UChar.t -> int
end
#
module Bytes : sig

Tables for integers. If the integers do not span the whole range of 31-bit or 63-bit values, Bytes.t is more space efficient than int tbl.

#
type t
#
val of_map : int -> int UMap.t -> t
#
val get : t -> UChar.t -> int
end
#
module Char : sig

Tables for bytes.

#
type t
#
val of_map : char -> char UMap.t -> t
#
val get : t -> UChar.t -> char
end
end
#
module UnicodeString : sig

Signature for Unicode strings. UText, XString, UTF8, UTF16, UCS4 have matched signatures to UStorage and satisfy the semantics described below. If users want to supply their own Unicode strings, please design the module with the following signature and properties.

#
module type Type = sig
#
type t

The type of string.

#
val get : t -> int -> UChar.t

get t i : i-th character of the storage.

#
val init : int -> (int -> UChar.t) -> t

init len f creates a new storage. the returned storage has length len, its nth-element is f n. f is called with integers 0 ... len - 1, only once for each integer. The call is in the increasing order f 0, f 1, f 2, ...

#
val length : t -> int

The number of Unicode characters in the storage

#
type index

locations in storages.

#
val look : t -> index -> UChar.t

look t i : The character in the location i of t.

#
val nth : t -> int -> index

nth t n : the location of the n-th character in t.

next x i, prev x i : The operation is valid if i points the valid element, i.e. the returned value may point the location beyond valid elements by one. If i does not point a valid element, the results are unspecified.

#
val next : t -> index -> index
#
val prev : t -> index -> index
#
val out_of_range : t -> index -> bool
#
val iter : (UChar.t -> unit) -> t -> unit
#
val compare : t -> t -> int
#
val first : t -> index

The location of the first character in the storage.

#
val last : t -> index

The location of the last character in the storage.

#
val move : t -> index -> int -> index

move t i n : if n >= 0, then returns n-th character after i and otherwise returns -n-th character before i. If there is no such character, or i does not point a valid character, the result is unspecified.

#
val compare_index : t -> index -> index -> int

compare_index t i j returns a positive integer if i is the location placed after j in t, 0 if i and j point the same location, and a negative integer if i is the location placed before j in t.

#
module Buf : sig

Character buffers. Similar to Buffer.

#
type buf
#
val create : int -> buf

create n creates the buffer. n is used to determine the initial size of the buffer. The meaning of n differs from module to module.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
end
#
module UText : sig

An implementation of Unicode string.

An implementation of Unicode string. Internally, it uses integer array. The semantics matches the description of UStorage.

#
type mutability = [
| `Mutable
| `Immutable
]

Phantom type for distinguishing mutability

#
type 'a text
#
type utext = [
| `Immutable
] text
#
type ustring = [
| `Mutable
] text
#
type t = utext
#
val utext_of_ustring : ustring -> utext
#
val ustring_of_utext : utext -> ustring
#
val get : 'a text -> int -> UChar.t
#
val set : ustring -> int -> UChar.t -> unit

set s i u sets the i-th character in s to u.

#
type index
#
val look : 'a text -> index -> UChar.t
#
val nth : 'a text -> int -> index
#
val first : 'a text -> index
#
val last : 'a text -> index
#
val out_of_range : 'a text -> index -> bool
#
val compare_index : 'a text -> index -> index -> int
#
val next : 'a text -> index -> index
#
val prev : 'a text -> index -> index
#
val move : 'a text -> index -> int -> index
#
val length : 'a text -> int
#
val of_string : string -> utext

Conversion from Latin-1 strings.

#
val init : int -> (int -> UChar.t) -> utext
#
val init_ustring : int -> (int -> UChar.t) -> ustring
#
val make : int -> UChar.t -> ustring

The semantics of these functions are similar to the equivalents of string.

#
val copy : ustring -> ustring
#
val sub : 'a text -> int -> int -> 'a text
#
val fill : ustring -> int -> int -> UChar.t -> unit
#
val blit : 'a text -> int -> ustring -> int -> int -> unit
#
val append : 'a text -> 'b text -> 'a text
#
val iter : (UChar.t -> unit) -> 'a text -> unit
#
val compare : 'a text -> 'b text -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf

create n creates the buffer which initially can contain n Unicode characters.

#
val contents : buf -> t
#
val contents_string : buf -> ustring
#
val length : buf -> int
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> 'a text -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module XString : sig

eXtensible Unicode string. The semantics matches the description of UStorage. The detail may be going to change.

#
type xstring
#
type t = xstring
#
val get : xstring -> int -> UChar.t
#
val set : xstring -> int -> UChar.t -> unit
#
val length : xstring -> int
#
val init : int -> (int -> UChar.t) -> xstring
#
type index
#
val look : xstring -> index -> UChar.t
#
val nth : xstring -> int -> index
#
val first : xstring -> index
#
val last : xstring -> index
#
val out_of_range : xstring -> index -> bool
#
val next : xstring -> index -> index
#
val prev : xstring -> index -> index
#
val move : xstring -> index -> int -> index
#
val compare_index : xstring -> index -> index -> int
#
val make : ?bufsize:int -> int -> UChar.t -> xstring
#
val clear : xstring -> unit
#
val reset : xstring -> unit
#
val copy : xstring -> xstring
#
val sub : xstring -> int -> int -> xstring
#
val add_char : xstring -> UChar.t -> unit
#
val add_text : xstring -> 'a UText.text -> unit
#
val add_xstring : xstring -> xstring -> unit
#
val shrink : xstring -> int -> unit
#
val append : xstring -> xstring -> xstring
#
val utext_of : xstring -> UText.t
#
val ustring_of : xstring -> UText.ustring
#
val iter : (UChar.t -> unit) -> xstring -> unit
#
val compare : t -> t -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf
#
val contents : buf -> t
#
val length : buf -> int
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module SubText : sig
#
module type Type = sig

Sub-texts, parts of original (ur-) texts. The signature and semantics match those of UStorage.

#
type t
#
val get : t -> int -> UChar.t
#
val init : int -> (int -> UChar.t) -> t
#
val length : t -> int
#
type index
#
val look : t -> index -> UChar.t
#
val nth : t -> int -> index
#
val first : t -> index
#
val last : t -> index
#
val next : t -> index -> index
#
val prev : t -> index -> index
#
val move : t -> index -> int -> index
#
val out_of_range : t -> index -> bool
#
val compare_index : t -> index -> index -> int
#
val iter : (UChar.t -> unit) -> t -> unit
#
val compare : t -> t -> int
#
module Buf : sig
#
type buf
#
val create : int -> buf
#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
#
type ur_text

The type of original texts.

#
type ur_index

The type of indexes of original texts.

#
val refer : ur_text -> ur_index -> ur_index -> t

refer t i j returns the part of t from i until j. The character pointed to by j is not included in the result. If j is equal to i or located before i, the result is an empty string.

#
val excerpt : t -> ur_text

excerpt t copies the contents of t as a new ur_text.

#
val context : t -> ur_text * ur_index * ur_index

context t returns the tuple (s, i, j) such that t = refer s i j.

#
val ur_index_of : t -> index -> ur_index

Conversion from indexes of sub-texts to ur_texts.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type ur_text = Text.t and type ur_index = Text.index
end
#
module ULine : sig

Line IO

Line I/O, conversion of line separators.

#
type separator = [
| `CR
| `LF
| `CRLF
| `NEL
| `LS
| `PS
]

Line separators.

  • `CR specifies carriage return.
  • `LF specifies linefeed.
  • `CRLF specifies the sequence of carriage return and linefeed.
  • `NEL specifies next line (\u0085).
  • `LS specifies Unicode line separator (\u2028).
  • `PS specifies Unicode paragraph separator (\u2029).
#
class input : separator -> UChar.t #OOChannel.#obj_input_channel -> [UChar.t] OOChannel.obj_input_channel

new input separator input_obj creates the new input channel object OOChannel.obj_input_channel which reads from input_obj and converts line separators (all of CR, LF, CRLF, NEL, LS, PS) to separator.

#
class output : separator -> UChar.t #OOChannel.#obj_output_channel -> [UChar.t] OOChannel.obj_output_channel

new output separator output_obj creates the new output channel object OOChannel.obj_output_channel which receives Unicode characters and converts line separators (all of CR, LF, CRLF, NEL, LS, PS) to separator.

#
module type Type = sig
#
type text
#
class input_line : UChar.t #OOChannel.#obj_input_channel -> [text] OOChannel.obj_input_channel

new input_line input_obj creates the new input channel object OOChannel.obj_input_channel which reads Unicode characters from input_obj and output lines. All of CR, LF, CRLF, NEL, LS, PS, as well as FF (formfeed) are recognised as a line separator.

#
class output_line : ?sp:separator option -> UChar.t #OOChannel.#obj_output_channel -> [text] OOChannel.obj_output_channel

new output_line ~sp output_obj creates the new output channel object OOChannel.obj_output_channel which outputs each line to output_obj using sp as a line separator. If sp is omitted, linefeed (LF) is used.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module Locale : sig

Camomile has a locale system similar to Java. A locale is a string with a form as "<LANG>_<COUNTRY>_<MODIFIER>..." where <LANG> is a 2-letter ISO 639 language code, <COUNTRY> is a 2-letter ISO 3166 country code. Some fields may not be present.

#
type t = string

Type of locales.

#
val read : string -> string -> (Pervasives.in_channel -> 'a) -> string -> 'a

read root suffix reader locale reads locale information using reader. Locale data is supposed to reside in the root directory with the name locale.suffix. reader takes in_channel as an argument and reads data from in_channel. If data is not found, then reader should raise Not_found. If the file is not found or reader raises Not_found, then more generic locales are tried. For example, if fr_CA.suffix is not found, then read tries fr.suffix. If fr.suffix is also not found, then the file root.suffix is tried. If the data is still not found, then Not_found is raised.

#
val contain : string -> string -> bool

contain loc1 loc2 : If loc1 is contained in loc2 then true otherwise false. For example, "fr" is contained in "fr_CA" while "en_CA" does not contain "fr"

end
#
module CharEncoding : CharEncoding.Interface
#
module UTF8 : sig

UTF-8 encoded Unicode strings. The type is normal string.

#
type t = string

UTF-8 encoded Unicode strings. The type is normal string.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UTF-8, otherwise raises Malformed_code. Other functions assume strings are valid UTF-8, so it is prudent to test their validity for strings from untrusted origins.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s. The call requires O(n)-time.

#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialized by f i

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of bytes from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

The position of the head of the first Unicode character.

#
val last : t -> index

The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i is a position inside of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns a value < 0 if i1 is the position located before i2, 0 if i1 and i2 points the same location, a value > 0 if i1 is the position located after i2.

#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i. If i is inside of s, the function always succeeds. If i is inside of s and there is no Unicode character after i, the position outside s is returned. If i is not inside of s, the behaviour is unspecified.

#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i. If i is inside of s, the function always succeeds. If i is inside of s and there is no Unicode character before i, the position outside s is returned. If i is not inside of s, the behaviour is unspecified.

#
val move : t -> index -> int -> index

move s i n returns n-th Unicode character after i if n >= 0, n-th Unicode character before i if n < 0. If there is no such character, the result is unspecified.

#
val iter : (UChar.t -> unit) -> t -> unit

iter f s applies f to all Unicode characters in s. The order of application is same to the order of the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison by the lexicographic order. compare s1 s2 returns a positive integer if s1 > s2, 0 if s1 = s2, a negative integer if s1 < s2.

#
module Buf : sig

Buffer module for UTF-8 strings

#
type buf

Buffers for UTF-8 strings.

#
val create : int -> buf

create n creates the buffer with the initial size n-bytes.

#
val contents : buf -> t

contents buf returns the contents of the buffer.

#
val clear : buf -> unit

Empty the buffer, but retains the internal storage which was holding the contents

#
val reset : buf -> unit

Empty the buffer and de-allocate the internal storage.

#
val add_char : buf -> UChar.t -> unit

Add one Unicode character to the buffer.

#
val add_string : buf -> t -> unit

Add the UTF-8 string to the buffer.

#
val add_buffer : buf -> buf -> unit

add_buffer b1 b2 adds the contents of b2 to b1. The contents of b2 is not changed.

end
with type buf = Buffer.t
end
#
module UTF16 : sig
#
type t = (int, Bigarray.int16_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t

UTF-16 encoded string. the type is the bigarray of 16-bit integers. The characters must be 21-bits code points, and not surrogate points, 0xfffe, 0xffff. Bigarray.cma or Bigarray.cmxa must be linked when this module is used.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UTF-16, otherwise raises Malformed_code. Other functions assume strings are valid UTF-16, so it is prudent to test their validity for strings from untrusted origins.

All functions below assume strings are valid UTF-16. If not, the result is unspecified.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s. The call requires O(n)-time.

#
exception Out_of_range
#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialized by f i. If the character is not representable, raises Out_of_range.

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of 16-bit unit from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

first s : The position of the head of the first Unicode character.

#
val last : t -> index

last s : The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i is inside of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns

  • If i1 is the position located before i2, a value < 0,
  • If i1 and i2 points the same location, 0,
  • If i1 is the position located after i2, a value > 0.
#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i.

  • If i is a valid position, the function always succeeds.
  • If i is a valid position and there is no Unicode character after i, the position outside s is returned.
  • If i is not a valid position, the behaviour is undefined.
#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i.

  • If i is a valid position, the function always succeeds.
  • If i is a valid position and there is no Unicode character before i, the position outside s is returned.
  • If i is not a valid position, the behaviour is undefined.
#
val move : t -> index -> int -> index
#
val iter : (UChar.t -> unit) -> t -> unit

iter f s Apply f to all Unicode characters in s. The order of application is same to the order in the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison

#
module Buf : sig

Buffer module for UTF-16

#
type buf
#
val create : int -> buf

create n : creates the buffer with the initial size n.

The rest of functions is similar to the ones of Buffer in stdlib.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit

if the character is not representable, raise Out_of_range

#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module UCS4 : sig

UCS4 encoded string. The type is the bigarray of 32-bit integers. Bigarray.cma or Bigarray.cmxa must be linked when this module is used.

#
exception Malformed_code
#
val validate : t -> unit

validate s succeeds if s is valid UCS4, otherwise raises Malformed_code. Other functions assume strings are valid UCS4, so it is prudent to test their validity for strings from untrusted origins.

All functions below assume strings are valid UCS4. If not, the result is unspecified.

#
val get : t -> int -> UChar.t

get s n returns n-th Unicode character of s.

#
val init : int -> (int -> UChar.t) -> t

init len f returns a new string which contains len Unicode characters. The i-th Unicode character is initialised by f i

#
val length : t -> int

length s returns the number of Unicode characters contained in s

#
type index = int

Positions in the string represented by the number of characters from the head. The location of the first character is 0

#
val nth : t -> int -> index

nth s n returns the position of the n-th Unicode character. The call requires O(n)-time

#
val first : t -> index

first s : The position of the head of the first Unicode character.

#
val last : t -> index

last s : The position of the head of the last Unicode character.

#
val look : t -> index -> UChar.t

look s i returns the Unicode character of the location i in the string s.

#
val out_of_range : t -> index -> bool

out_of_range s i tests whether i points the valid position of s.

#
val compare_index : t -> index -> index -> int

compare_index s i1 i2 returns a value < 0 if i1 is the position located before i2, 0 if i1 and i2 point to the same location, and a value > 0 if i1 is the position located after i2.

#
val next : t -> index -> index

next s i returns the position of the head of the Unicode character located immediately after i. If i is a valid position, the function always succeeds. If i is a valid position and there is no Unicode character after i, the position outside s is returned. If i is not a valid position, the behaviour is undefined.

#
val prev : t -> index -> index

prev s i returns the position of the head of the Unicode character located immediately before i. If i is a valid position, the function always succeeds. If i is a valid position and there is no Unicode character before i, the position outside s is returned. If i is not a valid position, the behaviour is undefined.

#
val move : t -> index -> int -> index

move s i n : If n >= 0, returns n-th Unicode character after i. If n < 0, returns -n-th Unicode character before i. If there is no such character, the result is unspecified.

#
val iter : (UChar.t -> unit) -> t -> unit

iter f s : Apply f to all Unicode characters in s. The order of application is same to the order in the Unicode characters in s.

#
val compare : t -> t -> int

Code point comparison

#
module Buf : sig

Buffer module for UCS4

#
type buf

create n creates the buffer with the initial size n.

#
val create : int -> buf

The rest of functions is similar to the ones of Buffer in stdlib.

#
val contents : buf -> t
#
val clear : buf -> unit
#
val reset : buf -> unit
#
val add_char : buf -> UChar.t -> unit
#
val add_string : buf -> t -> unit
#
val add_buffer : buf -> buf -> unit
end
end
#
module UPervasives : sig

Functions for toplevel

#
type uchar = UChar.t

Aliases for UChar.uint_code, UChar.chr_of_uint

#
val int_of_uchar : uchar -> int
#
val uchar_of_int : int -> uchar
#
val escaped_uchar : uchar -> string
#
val escaped_utf8 : string -> string
#
val printer_utf8 : Format.formatter -> string -> unit
#
val printer_uchar : Format.formatter -> uchar -> unit
end
#
module URe : sig

Regular expression engine.

#
type regexp = [
| `Alt of regexp * regexp
| `Seq of regexp * regexp
| `Rep of regexp
| `Repn of regexp * int * int option
| `After of regexp
| `Before of regexp
| `Epsilon
| `Group of regexp
| `OneChar
| `String of UChar.t list
| `Set of USet.t
| `BoS
| `EoS
]

Abstract syntax trees of regular expressions.

#
type match_semantics = [
| `First
| `Shortest
| `Longest
]

Match semantics.

#
val no_group : regexp -> regexp

Remove `Group from the regular expressions.

#
module type Type = sig
#
type text
#
type index
#
type compiled_regexp
#
module SubText : SubText.Type with type ur_text = text and type ur_index = index
#
val compile : regexp -> compiled_regexp

Compile regular expressions.

#
val regexp_match : ?sem:match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

regexp_match ?sem r t i tries matching r and substrings of t beginning from i. If the match succeeds, Some g is returned where g is the array containing the matched string of the n-th group in the n-th element. The matched string of the whole r is stored in the 0-th element. If matching fails, None is returned.

#
val string_match : compiled_regexp -> text -> index -> bool

string_match r t i tests whether r can match a substring of t beginning from i.

#
val search_forward : ?sem:match_semantics -> compiled_regexp -> text -> index -> SubText.t option array option

search_forward ?sem r t i searches a substring of t matching r from i. The returned value is similar to URe.Type.regexp_match.

end
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module UCharInfo : UCharInfo.Type
#
module UNF : sig
#
module type Type = UNF.Type
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module UCol : sig
#
type variable_option = [
| `Blanked
| `Non_ignorable
| `Shifted
| `Shift_Trimmed
]

How variables are handled

#
type precision = [
| `Primary
| `Secondary
| `Tertiary
| `Quaternary
]

Strength of comparison. For European languages, each strength roughly means the following. `Primary : Ignore accents and case. `Secondary : Ignore case but accents are counted in. `Tertiary : Accents and case are counted in. For the case of `Shifted and `Shift_Trimmed, there is a fourth strength. `Quaternary : Variables such as - (hyphen) are counted in.

#
module type Type = UCol.Type
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t and type index = Text.index
end
#
module CaseMap : sig
#
module type Type = CaseMap.Type
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
#
module UReStr : UReStr.Interface
#
module StringPrep : sig
#
module type Type = StringPrep.Type
#
module Make : functor (Text : UnicodeString.Type) -> Type with type text = Text.t
end
end
#
module Make : functor (Config : ConfigInt.Type) -> Type with module OOChannel = OOChannel and module UChar = UChar and module USet = USet and module UMap = UMap and module UCharTbl = UCharTbl and module UnicodeString = UnicodeString and module UText = UText and module XString = XString and module SubText = SubText and module ULine = ULine and module Locale = Locale and module CharEncoding = CharEncoding.Configure(Config) and module UTF8 = UTF8 and module UTF16 = UTF16 and module UCS4 = UCS4 and module UPervasives = UPervasives and module URe = URe and module UCharInfo = UCharInfo.Make(Config) and module UNF.Make = UNF.Make(Config) and module UCol.Make = UCol.Make(Config) and module CaseMap.Make = CaseMap.Make(Config) and module UReStr = UReStr.Configure(Config) and module StringPrep.Make = StringPrep.Make(Config)

All-in-one, configure once modules

end