mirror of
git://git.sv.gnu.org/emacs.git
synced 2026-02-17 01:34:21 +00:00
When making a readable page in EWW, include the <title> and similar tags
* lisp/net/eww.el (eww--walk-readability, eww-readable-dom): New functions. (eww-display-html): Call 'eww-readable-dom'. (eww-readable): Call 'eww-readable-dom'. Don't copy over 'eww-data' properties that our new readable page can handle on its own. (eww-score-readability): Rewrite in terms of 'eww--walk-readability'. Make obsolete. (eww-highest-readability): Make obsolete. * test/lisp/net/eww-tests.el (eww-test--lots-of-words) (eww-test--wordy-page): New variables... (eww-test/readable/toggle-display): ... use them. (eww-test/readable/default-readable): Make sure that the readable page includes the <title> and <link> tags (bug#77299).
This commit is contained in:
parent
faae9f572a
commit
2bdcf0250a
2 changed files with 102 additions and 39 deletions
106
lisp/net/eww.el
106
lisp/net/eww.el
|
|
@ -863,8 +863,7 @@ This replaces the region with the preprocessed HTML."
|
|||
(unless document
|
||||
(let ((dom (eww--parse-html-region (point) (point-max) charset)))
|
||||
(when (eww-default-readable-p url)
|
||||
(eww-score-readability dom)
|
||||
(setq dom (eww-highest-readability dom))
|
||||
(setq dom (eww-readable-dom dom))
|
||||
(with-current-buffer buffer
|
||||
(plist-put eww-data :readable t)))
|
||||
(setq document (eww-document-base url dom))))
|
||||
|
|
@ -1163,42 +1162,97 @@ adds a new entry to `eww-history'."
|
|||
(eww--parse-html-region (point-min) (point-max))))
|
||||
(base (plist-get eww-data :url)))
|
||||
(when make-readable
|
||||
(eww-score-readability dom)
|
||||
(setq dom (eww-highest-readability dom)))
|
||||
(setq dom (eww-readable-dom dom)))
|
||||
(when eww-readable-adds-to-history
|
||||
(eww-save-history)
|
||||
(eww--before-browse)
|
||||
(dolist (elem '(:source :url :title :next :previous :up :peer))
|
||||
(dolist (elem '(:source :url :peer))
|
||||
(plist-put eww-data elem (plist-get old-data elem))))
|
||||
(eww-display-document (eww-document-base base dom))
|
||||
(plist-put eww-data :readable make-readable)
|
||||
(eww--after-page-change)))
|
||||
|
||||
(defun eww-score-readability (node)
|
||||
(let ((score -1))
|
||||
(cond
|
||||
((memq (dom-tag node) '(script head comment))
|
||||
(setq score -2))
|
||||
((eq (dom-tag node) 'meta)
|
||||
(setq score -1))
|
||||
((eq (dom-tag node) 'img)
|
||||
(setq score 2))
|
||||
((eq (dom-tag node) 'a)
|
||||
(setq score (- (length (split-string (dom-text node))))))
|
||||
(t
|
||||
(defun eww--walk-readability (node callback &optional noscore)
|
||||
"Walk through all children of NODE to score readability.
|
||||
After scoring, call CALLBACK with the node and score. If NOSCORE is
|
||||
non-nil, don't actually compute a score; just call the callback."
|
||||
(let ((score nil))
|
||||
(unless noscore
|
||||
(cond
|
||||
((stringp node)
|
||||
(setq score (length (split-string node))
|
||||
noscore t))
|
||||
((memq (dom-tag node) '(script head comment))
|
||||
(setq score -2
|
||||
noscore t))
|
||||
((eq (dom-tag node) 'meta)
|
||||
(setq score -1
|
||||
noscore t))
|
||||
((eq (dom-tag node) 'img)
|
||||
(setq score 2
|
||||
noscore t))
|
||||
((eq (dom-tag node) 'a)
|
||||
(setq score (- (length (split-string (dom-text node))))
|
||||
noscore t))
|
||||
(t
|
||||
(setq score -1))))
|
||||
(when (consp node)
|
||||
(dolist (elem (dom-children node))
|
||||
(cond
|
||||
((stringp elem)
|
||||
(setq score (+ score (length (split-string elem)))))
|
||||
((consp elem)
|
||||
(setq score (+ score
|
||||
(or (cdr (assoc :eww-readability-score (cdr elem)))
|
||||
(eww-score-readability elem)))))))))
|
||||
;; Cache the score of the node to avoid recomputing all the time.
|
||||
(dom-set-attribute node :eww-readability-score score)
|
||||
(let ((subscore (eww--walk-readability elem callback noscore)))
|
||||
(when (and (not noscore) subscore)
|
||||
(incf score subscore)))))
|
||||
(funcall callback node score)
|
||||
score))
|
||||
|
||||
(defun eww-readable-dom (dom)
|
||||
"Return a readable version of DOM."
|
||||
(let ((head-nodes nil)
|
||||
(best-node nil)
|
||||
(best-score most-negative-fixnum))
|
||||
(eww--walk-readability
|
||||
dom
|
||||
(lambda (node score)
|
||||
(when (consp node)
|
||||
(when (and score (> score best-score)
|
||||
;; We set a lower bound to how long we accept that
|
||||
;; the readable portion of the page is going to be.
|
||||
(> (length (split-string (dom-texts node))) 100))
|
||||
(setq best-score score
|
||||
best-node node))
|
||||
;; Keep track of any <title> and <link> tags we find to include
|
||||
;; in the final document. EWW uses them for various features,
|
||||
;; like renaming the buffer or navigating to "next" and
|
||||
;; "previous" pages. NOTE: We could probably filter out
|
||||
;; stylesheet <link> tags here, though it doesn't really matter
|
||||
;; since we don't *do* anything with stylesheets...
|
||||
(when (memq (dom-tag node) '(title link))
|
||||
;; Copy the node, but not any of its (non-text) children.
|
||||
;; This way, we can ensure that we don't include a node
|
||||
;; directly in our list in addition to as a child of some
|
||||
;; other node in the list. This is ok for <title> and <link>
|
||||
;; tags, but might need changed if supporting other tags.
|
||||
(let* ((inner-text (dom-texts node ""))
|
||||
(new-node `(,(dom-tag node)
|
||||
,(dom-attributes node)
|
||||
,@(when (length> inner-text 0)
|
||||
(list inner-text)))))
|
||||
(push new-node head-nodes))))))
|
||||
(if (and best-node (not (eq best-node dom)))
|
||||
`(html nil
|
||||
(head nil ,@head-nodes)
|
||||
(body nil ,best-node))
|
||||
dom)))
|
||||
|
||||
(defun eww-score-readability (node)
|
||||
(declare (obsolete 'eww--walk-readability "31.1"))
|
||||
(eww--walk-readability
|
||||
node
|
||||
(lambda (node score)
|
||||
(when (and score (consp node))
|
||||
(dom-set-attribute node :eww-readability-score score)))))
|
||||
|
||||
(defun eww-highest-readability (node)
|
||||
(declare (obsolete 'eww-readable-dom "31.1"))
|
||||
(let ((result node)
|
||||
highest)
|
||||
(dolist (elem (dom-non-text-children node))
|
||||
|
|
|
|||
|
|
@ -29,6 +29,21 @@
|
|||
The default just returns an empty list of headers and the URL as the
|
||||
body.")
|
||||
|
||||
(defvar eww-test--lots-of-words
|
||||
(string-join (make-list 20 "All work and no play makes Jack a dull boy.")
|
||||
" ")
|
||||
"A long enough run of words to satisfy EWW's readable mode cutoff.")
|
||||
|
||||
(defvar eww-test--wordy-page
|
||||
(concat "<html>"
|
||||
"<head>"
|
||||
"<title>Welcome to my home page</title>"
|
||||
"<link rel=\"home\" href=\"somewhere.invalid\">"
|
||||
"</head><body>"
|
||||
"<a>This is an uninteresting sentence.</a>"
|
||||
"<div>" eww-test--lots-of-words "</div>"
|
||||
"</body></html>"))
|
||||
|
||||
(defmacro eww-test--with-mock-retrieve (&rest body)
|
||||
"Evaluate BODY with a mock implementation of `eww-retrieve'.
|
||||
This avoids network requests during our tests. Additionally, prepare a
|
||||
|
|
@ -201,19 +216,10 @@ This sets `eww-before-browse-history-function' to
|
|||
(eww-test--with-mock-retrieve
|
||||
(let* ((shr-width most-positive-fixnum)
|
||||
(shr-use-fonts nil)
|
||||
(words (string-join
|
||||
(make-list
|
||||
20 "All work and no play makes Jack a dull boy.")
|
||||
" "))
|
||||
(eww-test--response-function
|
||||
(lambda (_url)
|
||||
(concat "Content-Type: text/html\n\n"
|
||||
"<html><body>"
|
||||
"<a>This is an uninteresting sentence.</a>"
|
||||
"<div>"
|
||||
words
|
||||
"</div>"
|
||||
"</body></html>"))))
|
||||
eww-test--wordy-page))))
|
||||
(eww "example.invalid")
|
||||
;; Make sure EWW renders the whole document.
|
||||
(should-not (plist-get eww-data :readable))
|
||||
|
|
@ -224,7 +230,7 @@ This sets `eww-before-browse-history-function' to
|
|||
;; Now, EWW should render just the "readable" parts.
|
||||
(should (plist-get eww-data :readable))
|
||||
(should (string-match-p
|
||||
(concat "\\`" (regexp-quote words) "\n*\\'")
|
||||
(concat "\\`" (regexp-quote eww-test--lots-of-words) "\n*\\'")
|
||||
(buffer-substring-no-properties (point-min) (point-max))))
|
||||
(eww-readable 'toggle)
|
||||
;; Finally, EWW should render the whole document again.
|
||||
|
|
@ -240,11 +246,14 @@ This sets `eww-before-browse-history-function' to
|
|||
(let* ((eww-test--response-function
|
||||
(lambda (_url)
|
||||
(concat "Content-Type: text/html\n\n"
|
||||
"<html><body>Hello there</body></html>")))
|
||||
eww-test--wordy-page)))
|
||||
(eww-readable-urls '("://example\\.invalid/")))
|
||||
(eww "example.invalid")
|
||||
;; Make sure EWW uses "readable" mode.
|
||||
(should (plist-get eww-data :readable)))))
|
||||
(should (plist-get eww-data :readable))
|
||||
;; Make sure the page include the <title> and <link> nodes.
|
||||
(should (equal (plist-get eww-data :title) "Welcome to my home page"))
|
||||
(should (equal (plist-get eww-data :home) "somewhere.invalid")))))
|
||||
|
||||
(provide 'eww-tests)
|
||||
;; eww-tests.el ends here
|
||||
|
|
|
|||
Loading…
Reference in a new issue