Skip to content

Commit b55f7df

Browse files
authored
Add generators for UTF-8 strings (#319)
As reported in #318, `proper_types:string()` may generate invalid Unicode strings, because the Erlang type language overapproximates the set of lists that represent valid Unicode strings. The `proper_unicode` module is therefore extended with functions that generate valid UTF-8 character lists.
1 parent 546818b commit b55f7df

File tree

3 files changed

+34
-14
lines changed

3 files changed

+34
-14
lines changed

include/proper.hrl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
%%% -*- coding: utf-8; erlang-indent-level: 2 -*-
22
%%% -------------------------------------------------------------------
3-
%%% Copyright 2010-2022 Manolis Papadakis <[email protected]>,
3+
%%% Copyright 2010-2025 Manolis Papadakis <[email protected]>,
44
%%% Eirini Arvaniti <[email protected]>,
55
%%% and Kostis Sagonas <[email protected]>
66
%%%
@@ -19,7 +19,7 @@
1919
%%% You should have received a copy of the GNU General Public License
2020
%%% along with PropEr. If not, see <http://www.gnu.org/licenses/>.
2121

22-
%%% @copyright 2010-2022 Manolis Papadakis, Eirini Arvaniti, and Kostis Sagonas
22+
%%% @copyright 2010-2025 Manolis Papadakis, Eirini Arvaniti, and Kostis Sagonas
2323
%%% @version {@version}
2424
%%% @author Manolis Papadakis
2525
%%% @doc User header file: This file should be included in each file containing
@@ -70,7 +70,8 @@
7070
%% Unicode
7171
%%------------------------------------------------------------------------------
7272

73-
-import(proper_unicode, [utf8/0, utf8/1, utf8/2]).
73+
-import(proper_unicode, [utf8/0, utf8/1, utf8/2,
74+
utf8_string/0, utf8_string/1, utf8_string/2]).
7475

7576

7677
%%------------------------------------------------------------------------------

src/proper_unicode.erl

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
%%% -*- coding: utf-8 -*-
2-
%%% -*- erlang-indent-level: 2 -*-
1+
%%% -*- coding: utf-8; erlang-indent-level: 2 -*-
32
%%% -------------------------------------------------------------------
43
%%% Copyright 2014 Motiejus Jakstys <[email protected]>
54
%%%
@@ -24,10 +23,10 @@
2423

2524
%%% @doc Unicode generators for PropEr
2625
%%%
27-
%%% This module exposes utf8 binary generator.
26+
%%% This module exposes utf8 binary and string generators.
2827
%%%
29-
%%% Makes it easy to create custom-encoded unicode binaries. For example,
30-
%%% utf16 binary generator:
28+
%%% Makes it easy to create custom-encoded unicode binaries and strings.
29+
%%% For example, utf16 binary generator:
3130
%%%
3231
%%% ```
3332
%%% utf16() ->
@@ -40,18 +39,20 @@
4039
%%% ?FORALL(S, utf16(),
4140
%%% size(S) >= 2*length(unicode:characters_to_list(S, utf16))).
4241
%%% '''
43-
%%% Only utf8 generation is supported: {@link utf8/0}, {@link utf8/1}, {@link
44-
%%% utf8/2}. Unicode codepoints and other encodings are trivial to get with
45-
%%% utf8 generators and {@link unicode} module in OTP.
42+
43+
%%% Only utf8 generation is supported: see {@link utf8/0}, {@link utf8/1},
44+
%%% {@link utf8/2} which generate binaries and the corresponding functions
45+
%%% generating strings. Unicode codepoints and other encodings are trivial
46+
%%% to get with utf8 generators and the {@link unicode} module in OTP.
4647
-module(proper_unicode).
4748

48-
-export([utf8/0, utf8/1, utf8/2]).
49+
-export([utf8/0, utf8/1, utf8/2, utf8_string/0, utf8_string/1, utf8_string/2]).
4950

5051
-include("proper_common.hrl").
5152

5253
%% @private_type
5354
%% @alias
54-
-type nonnegextint() :: non_neg_integer() | 'inf'.
55+
-type nonnegextint() :: non_neg_integer() | 'inf'.
5556

5657

5758
%% @doc utf8-encoded unbounded size binary.
@@ -78,6 +79,21 @@ utf8(N, MaxCodePointSize) ->
7879
unicode:characters_to_binary(Str)).
7980

8081

82+
%% @doc Unbounded size string: the string (character list) counterpart of
%% {@link utf8/0}, which generates binaries. Equivalent to
%% `utf8_string(inf, 4)'.
83+
-spec utf8_string() -> proper_types:type().
84+
utf8_string() ->
85+
utf8_string(inf, 4).
86+
87+
%% @doc Bounded upper size string (character list), with `N' as the size
%% bound. Equivalent to `utf8_string(N, 4)'.
%% NOTE(review): `N' presumably bounds the number of code points, not the
%% number of bytes -- confirm against `vector_upto'.
88+
-spec utf8_string(nonnegextint()) -> proper_types:type().
89+
utf8_string(N) ->
90+
utf8_string(N, 4).
91+
92+
%% @doc Bounded upper size string (character list), `codepoint length =<
%% MaxCodePointSize'. Built as `vector_upto(N, ...)' over elements drawn from
%% `unicode_codepoint_upto(MaxCodePointSize)'.
%% NOTE(review): "codepoint length" presumably means the number of bytes in a
%% code point's UTF-8 encoding (1..4) -- confirm against
%% `unicode_codepoint_upto'.
93+
-spec utf8_string(nonnegextint(), 1..4) -> proper_types:type().
94+
utf8_string(N, MaxCodePointSize) ->
95+
vector_upto(N, unicode_codepoint_upto(MaxCodePointSize)).
96+
8197
%% =============================================================================
8298
%% Internal functions
8399
%% =============================================================================

test/proper_tests.erl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,10 @@ native_type_props_test_() ->
937937
?_passes(?FORALL(B, utf8(2, 1), byte_size(B) =< 2)),
938938
?_passes(?FORALL(B, utf8(4), byte_size(B) =< 16)),
939939
?_passes(?FORALL(B, utf8(),
940-
length(unicode:characters_to_list(B)) =< byte_size(B)))
940+
length(unicode:characters_to_list(B)) =< byte_size(B))),
941+
?_passes(?FORALL(S, utf8_string(), unicode:characters_to_list(S) =:= S)),
942+
?_passes(?FORALL(S, utf8_string(4),
943+
byte_size(unicode:characters_to_binary(S)) =< 16))
941944
].
942945

943946
-type bin4() :: <<_:32>>.

0 commit comments

Comments
 (0)