Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions common/hstrings/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
"slices"
"strings"
"sync"
"unicode"
"unicode/utf8"

"github.com/gohugoio/hugo/compare"
)
Expand Down Expand Up @@ -128,6 +130,45 @@ func ToString(v any) (string, bool) {
return "", false
}

// CountWords returns the approximate number of words in s, reported
// separately for non-CJK and CJK text, in that order.
//
// A word is a maximal run of non-space characters, with spaces as defined by
// unicode.IsSpace. A word whose first character is Han, Hangul, Hiragana, or
// Katakana adds its full character count to cjk, since CJK text does not
// separate words with spaces. Every other word adds one to nonCJK.
func CountWords(s string) (nonCJK, cjk int) {
	// count classifies a single word by its first character.
	count := func(word string) {
		first, _ := utf8.DecodeRuneInString(word)
		if isCJK(first) {
			cjk += utf8.RuneCountInString(word)
		} else {
			nonCJK++
		}
	}

	// Scan s once and slice each word out in place. This avoids both a
	// separate pre-scan for CJK characters and the []string that
	// strings.Fields would allocate.
	start := -1 // Byte offset of the current word, or -1 between words.
	for i, r := range s {
		switch {
		case unicode.IsSpace(r):
			if start >= 0 {
				count(s[start:i])
				start = -1
			}
		case start < 0:
			start = i
		}
	}
	if start >= 0 {
		// The last word runs to the end of s.
		count(s[start:])
	}

	return nonCJK, cjk
}

// isCJK reports whether r is a Chinese, Japanese, or Korean (CJK) character,
// that is, whether it belongs to the Han, Hangul, Hiragana, or Katakana
// script.
func isCJK(r rune) bool {
	return unicode.In(r, unicode.Han, unicode.Hangul, unicode.Hiragana, unicode.Katakana)
}

type (
Strings2 [2]string
Strings3 [3]string
Expand Down
28 changes: 28 additions & 0 deletions common/hstrings/strings_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package hstrings

import (
"regexp"
"strings"
"testing"

qt "github.com/frankban/quicktest"
Expand Down Expand Up @@ -54,3 +55,30 @@ func BenchmarkCompileRegexp(b *testing.B) {
regexp.MustCompile(`\d+`)
}
}

// BenchmarkCountWordsASCII measures CountWords on space-separated ASCII text.
func BenchmarkCountWordsASCII(b *testing.B) {
	s := "The quick brown fox jumps over the lazy dog"
	// Repeat with a trailing space so that adjacent copies do not fuse into
	// "dogThe"; this keeps the input at exactly 9 x 400 = 3600 words.
	sLong := strings.Repeat(s+" ", 400)
	total := 0
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		nonCJK, cjk := CountWords(sLong)
		total += nonCJK + cjk
	}
	// Use the result so that the call cannot be optimized away.
	if total == 0 {
		b.Fatal("CountWords returned no words")
	}
}

// BenchmarkCountWordsMixed measures CountWords on text that mixes
// space-separated ASCII words with CJK words.
func BenchmarkCountWordsMixed(b *testing.B) {
	s := "The 素早い brown fox jumps over the lazy 犬"
	// Repeat with a trailing space so that adjacent copies do not fuse into
	// "犬The", which would be counted as four CJK characters.
	sLong := strings.Repeat(s+" ", 400)
	total := 0
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		nonCJK, cjk := CountWords(sLong)
		total += nonCJK + cjk
	}
	// Use the result so that the call cannot be optimized away.
	if total == 0 {
		b.Fatal("CountWords returned no words")
	}
}

// BenchmarkCountWordsChinese measures CountWords on Chinese text. The input
// contains no whitespace, so it forms a single unbroken run of characters.
func BenchmarkCountWordsChinese(b *testing.B) {
	s := "敏捷的棕狐狸跳过懒惰的狗"
	sLong := strings.Repeat(s, 400)
	total := 0
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		nonCJK, cjk := CountWords(sLong)
		total += nonCJK + cjk
	}
	// Use the result so that the call cannot be optimized away.
	if total == 0 {
		b.Fatal("CountWords returned no words")
	}
}
3 changes: 0 additions & 3 deletions config/allconfig/allconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -664,9 +664,6 @@ type RootConfig struct {
// The default language code.
LanguageCode string

// Enable if the site content has CJK language (Chinese, Japanese, or Korean). This affects how Hugo counts words.
HasCJKLanguage bool

// The default number of pages per page when paginating.
// Deprecated: Use the Pagination struct.
Paginate int
Expand Down
3 changes: 0 additions & 3 deletions docs/content/en/configuration/all.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,6 @@ environment
frontmatter
: See [configure front matter](/configuration/front-matter/).

hasCJKLanguage
: (`bool`) Whether to automatically detect [CJK](g) languages in content. Affects the values returned by the [`WordCount`] and [`FuzzyWordCount`] methods. Default is `false`.

HTTPCache
: See [configure HTTP cache](/configuration/http-cache/).

Expand Down
3 changes: 0 additions & 3 deletions docs/content/en/content-management/front-matter.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,6 @@ expiryDate
headless
: (`bool`) Applicable to [leaf bundles], whether to set the `render` and `list` [build options] to `never`, creating a headless bundle of [page resources].

isCJKLanguage
: (`bool`) Whether the content language is in the [CJK](g) family. This value determines how Hugo calculates word count, and affects the values returned by the [`WordCount`], [`FuzzyWordCount`], [`ReadingTime`], and [`Summary`] methods on a `Page` object.

keywords
: (`string array`) An array of keywords, typically rendered within a `meta` element within the `head` element of the published HTML file, or used as a [taxonomy](g) to classify content. Access these values from a template using the [`Keywords`] method on a `Page` object.

Expand Down
1 change: 0 additions & 1 deletion docs/content/en/troubleshooting/inspection.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ Use the [`debug.Dump`] function to inspect a data structure:
{
"date": "2023-11-10T15:10:42-08:00",
"draft": false,
"iscjklanguage": false,
"lastmod": "2023-11-10T15:10:42-08:00",
"publishdate": "2023-11-10T15:10:42-08:00",
"tags": [
Expand Down
1 change: 0 additions & 1 deletion docs/layouts/_shortcodes/per-lang-config-keys.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
(dict "disablePathToLower" "/configuration/all/#disablepathtolower")
(dict "enableEmoji " "/configuration/all/#enableemoji")
(dict "frontmatter" "/configuration/front-matter/")
(dict "hasCJKLanguage" "/configuration/all/#hascjklanguage")
(dict "languageCode" "/configuration/all/#languagecode")
(dict "mainSections" "/configuration/all/#mainsections")
(dict "markup" "/configuration/markup/")
Expand Down
18 changes: 4 additions & 14 deletions helpers/content.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ import (
"bytes"
"html/template"
"strings"
"unicode"

"github.com/gohugoio/hugo/common/hexec"
"github.com/gohugoio/hugo/common/hstrings"
"github.com/gohugoio/hugo/common/loggers"
"github.com/gohugoio/hugo/media"

Expand Down Expand Up @@ -145,20 +145,10 @@ func (c *ContentSpec) ResolveMarkup(in string) string {
return ""
}

// TotalWords counts instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, in s.
// This is a cheaper way of word counting than the obvious len(strings.Fields(s)).
// TotalWords returns the approximate number of words in s. Each character of
// a word that starts with a CJK character is counted as one word.
func TotalWords(s string) int {
n := 0
inWord := false
for _, r := range s {
wasInWord := inWord
inWord = !unicode.IsSpace(r)
if inWord && !wasInWord {
n++
}
}
return n
nonCjkWordCount, cjkWordCount := hstrings.CountWords(s)
return nonCjkWordCount + cjkWordCount
}

// TrimShortHTML removes the outer tags from HTML input where (a) the opening
Expand Down
16 changes: 7 additions & 9 deletions hugolib/cascade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,16 @@ cascade:

if withHomeContent {
b.Assert(p1.Params(), qt.DeepEquals, maps.Params{
"imgconfig": "img-config.jpg",
"draft": bool(false),
"iscjklanguage": bool(false),
"img1": "img1-home.jpg",
"img2": "img2-home.jpg",
"imgconfig": "img-config.jpg",
"draft": bool(false),
"img1": "img1-home.jpg",
"img2": "img2-home.jpg",
})
} else {
b.Assert(p1.Params(), qt.DeepEquals, maps.Params{
"img1": "img1-config.jpg",
"imgconfig": "img-config.jpg",
"draft": bool(false),
"iscjklanguage": bool(false),
"img1": "img1-config.jpg",
"imgconfig": "img-config.jpg",
"draft": bool(false),
})
}
})
Expand Down
2 changes: 1 addition & 1 deletion hugolib/hugo_smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ Content Tag 1.

b.AssertFileContent("public/en/posts/p1/index.html",
"Single: en|page|/en/posts/p1/|Post 1|<p>Content 1.</p>\n|Len Resources: 2|",
"Resources: text|/en/posts/p1/f1.txt|text/plain|map[icon:enicon] - page||application/octet-stream|map[draft:false iscjklanguage:false title:Post Sub 1] -",
"Resources: text|/en/posts/p1/f1.txt|text/plain|map[icon:enicon] - page||application/octet-stream|map[draft:false title:Post Sub 1] -",
"Icon: enicon",
"Icon fingerprinted: enicon|/en/posts/p1/f1.e5746577af5cbfc4f34c558051b7955a9a5a795a84f1c6ab0609cb3473a924cb.txt|",
"NextInSection: |\nPrevInSection: /en/posts/p2/|Post 2|",
Expand Down
30 changes: 7 additions & 23 deletions hugolib/page__content.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ import (
"path/filepath"
"strconv"
"strings"
"unicode/utf8"

maps0 "maps"

"github.com/bep/logg"
"github.com/gohugoio/hugo/common/hcontext"
"github.com/gohugoio/hugo/common/herrors"
"github.com/gohugoio/hugo/common/hstrings"
"github.com/gohugoio/hugo/common/hugio"
"github.com/gohugoio/hugo/common/hugo"
"github.com/gohugoio/hugo/common/maps"
Expand Down Expand Up @@ -620,8 +620,7 @@ func (c *cachedContentScope) contentRendered(ctx context.Context) (contentSummar

if !c.pi.hasSummaryDivider && cp.po.p.m.pageConfig.Summary == "" {
numWords := cp.po.p.s.conf.SummaryLength
isCJKLanguage := cp.po.p.m.pageConfig.IsCJKLanguage
summary := page.ExtractSummaryFromHTML(cp.po.p.m.pageConfig.ContentMediaType, string(result.content), numWords, isCJKLanguage)
summary := page.ExtractSummaryFromHTML(cp.po.p.m.pageConfig.ContentMediaType, string(result.content), numWords)
result.summary = page.Summary{
Text: template.HTML(summary.Summary()),
Type: page.SummaryTypeAuto,
Expand Down Expand Up @@ -817,32 +816,17 @@ func (c *cachedContentScope) contentPlain(ctx context.Context) (contentPlainPlai
result.plain = tpl.StripHTML(string(rendered.content))
result.plainWords = strings.Fields(result.plain)

isCJKLanguage := cp.po.p.m.pageConfig.IsCJKLanguage

if isCJKLanguage {
result.wordCount = 0
for _, word := range result.plainWords {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
result.wordCount++
} else {
result.wordCount += runeCount
}
}
} else {
result.wordCount = helpers.TotalWords(result.plain)
}
nonCjkWordCount, cjkWordCount := hstrings.CountWords(result.plain)
result.wordCount = cjkWordCount + nonCjkWordCount

// TODO(bep) is set in a test. Fix that.
if result.fuzzyWordCount == 0 {
result.fuzzyWordCount = (result.wordCount + 100) / 100 * 100
}

if isCJKLanguage {
result.readingTime = (result.wordCount + 500) / 501
} else {
result.readingTime = (result.wordCount + 212) / 213
}
cjkReadingTime := (cjkWordCount + 500) / 501
nonCjkReadingTime := (nonCjkWordCount + 212) / 213
result.readingTime = cjkReadingTime + nonCjkReadingTime

rs.Value = result

Expand Down
20 changes: 1 addition & 19 deletions hugolib/page__meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
"context"
"fmt"
"path/filepath"
"regexp"
"strings"
"time"

Expand Down Expand Up @@ -45,8 +44,6 @@ import (
"github.com/spf13/cast"
)

var cjkRe = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)

type pageMeta struct {
term string // Set for kind == KindTerm.
singular string // Set for kind == KindTerm and kind == KindTaxonomy.
Expand Down Expand Up @@ -471,7 +468,7 @@ params:
panic("params not set for " + p.Title())
}

var draft, published, isCJKLanguage *bool
var draft, published *bool
var userParams map[string]any
for k, v := range pcfg.Params {
loki := strings.ToLower(k)
Expand Down Expand Up @@ -578,9 +575,6 @@ params:
return fmt.Errorf("failed to decode sitemap config in front matter: %s", err)
}
sitemapSet = true
case "iscjklanguage":
isCJKLanguage = new(bool)
*isCJKLanguage = cast.ToBool(v)
case "translationkey":
pcfg.TranslationKey = cast.ToString(v)
params[loki] = pcfg.TranslationKey
Expand Down Expand Up @@ -663,18 +657,6 @@ params:
}
params["draft"] = pcfg.Draft

if isCJKLanguage != nil {
pcfg.IsCJKLanguage = *isCJKLanguage
} else if p.s.conf.HasCJKLanguage && p.m.content.pi.openSource != nil {
if cjkRe.Match(p.m.content.mustSource()) {
pcfg.IsCJKLanguage = true
} else {
pcfg.IsCJKLanguage = false
}
}

params["iscjklanguage"] = pcfg.IsCJKLanguage

if err := pcfg.Init(false); err != nil {
return err
}
Expand Down
Loading