[Commits] [SCM] claws branch, master, updated. 3.15.0-175-g8e9f89b

mones at claws-mail.org mones at claws-mail.org
Wed Nov 15 20:37:03 CET 2017


The branch, master has been updated
       via  8e9f89bef30238e178d71344e5135b030933956e (commit)
       via  4d178a21c1d763945c6f27bc01814d58f7fa3dd7 (commit)
       via  1285ad7143f2aa0cf1fc3b6bad8af828d49ffb4c (commit)
       via  d61f4bdfcbe0f6e081ce15f394f501818039d233 (commit)
      from  e61840c926adafa402746471347f9d7a233c5564 (commit)

Summary of changes:
 src/Makefile.am                       |    2 +
 src/entity.c                          |  403 +++++++++++++++++++++++++++++++++
 src/{gtk/sslcertwindow.h => entity.h} |   30 ++-
 src/html.c                            |  319 +-------------------------
 src/html.h                            |    8 +-
 src/plugins/rssyl/strutils.c          |   77 +------
 6 files changed, 441 insertions(+), 398 deletions(-)
 create mode 100644 src/entity.c
 copy src/{gtk/sslcertwindow.h => entity.h} (62%)


- Log -----------------------------------------------------------------
commit 8e9f89bef30238e178d71344e5135b030933956e
Author: Ricardo Mones <ricardo at mones.org>
Date:   Mon Nov 13 21:11:33 2017 +0100

    Use entity decoding API in HTML parser

diff --git a/src/html.c b/src/html.c
index 95f5a08..e982794 100644
--- a/src/html.c
+++ b/src/html.c
@@ -24,288 +24,12 @@
 #include "html.h"
 #include "codeconv.h"
 #include "utils.h"
+#include "entity.h"
 
 #define SC_HTMLBUFSIZE	8192
 #define HR_STR		"────────────────────────────────────────────────"
 #define LI_STR		"• "
 
-typedef struct _SC_HTMLSymbol	SC_HTMLSymbol;
-
-struct _SC_HTMLSymbol
-{
-	gchar *const key;
-	gchar *const val;
-};
-
-static SC_HTMLSymbol symbol_list[] = {
- {"&#34;", "\42"},
- {"&#38;", "\46"},
- {"&#39;", "\47"},
- {"&#60;", "\74"},
- {"&#62;", "\76"},
- {"&#034;", "\42"},
- {"&#038;", "\46"},
- {"&#039;", "\47"},
- {"&#060;", "\74"},
- {"&#062;", "\76"},
- {"’", "\47"},
- {"™", "\342\204\242"},
- {" ", "\40"},
- {"¡", "\302\241"},
- {"¢", "\302\242"},
- {"£", "\302\243"},
- {"¤", "\302\244"},
- {"¥", "\302\245"},
- {"¦", "\302\246"},
- {"§", "\302\247"},
- {"¨", "\302\250"},
- {"©", "\302\251"},
- {"ª", "\302\252"},
- {"«", "\302\253"},
- {"¬", "\302\254"},
- {"­", "\302\255"},
- {"®", "\302\256"},
- {"¯", "\302\257"},
- {"°", "\302\260"},
- {"±", "\302\261"},
- {"²", "\302\262"},
- {"³", "\302\263"},
- {"´", "\302\264"},
- {"µ", "\302\265"},
- {"¶", "\302\266"},
- {"·", "\302\267"},
- {"¸", "\302\270"},
- {"¹", "\302\271"},
- {"º", "\302\272"},
- {"»", "\302\273"},
- {"¼", "\302\274"},
- {"½", "\302\275"},
- {"¾", "\302\276"},
- {"¿", "\302\277"},
- {"À", "\303\200"},
- {"Á", "\303\201"},
- {"Â", "\303\202"},
- {"Ã", "\303\203"},
- {"Ä", "\303\204"},
- {"Å", "\303\205"},
- {"Æ", "\303\206"},
- {"Ç", "\303\207"},
- {"È", "\303\210"},
- {"É", "\303\211"},
- {"Ê", "\303\212"},
- {"Ë", "\303\213"},
- {"Ì", "\303\214"},
- {"Í", "\303\215"},
- {"Î", "\303\216"},
- {"Ï", "\303\217"},
- {"Ð", "\303\220"},
- {"Ñ", "\303\221"},
- {"Ò", "\303\222"},
- {"Ó", "\303\223"},
- {"Ô", "\303\224"},
- {"Õ", "\303\225"},
- {"Ö", "\303\226"},
- {"×", "\303\227"},
- {"Ø", "\303\230"},
- {"Ù", "\303\231"},
- {"Ú", "\303\232"},
- {"Û", "\303\233"},
- {"Ü", "\303\234"},
- {"Ý", "\303\235"},
- {"Þ", "\303\236"},
- {"ß", "\303\237"},
- {"à", "\303\240"},
- {"á", "\303\241"},
- {"â", "\303\242"},
- {"ã", "\303\243"},
- {"ä", "\303\244"},
- {"å", "\303\245"},
- {"æ", "\303\246"},
- {"ç", "\303\247"},
- {"è", "\303\250"},
- {"é", "\303\251"},
- {"ê", "\303\252"},
- {"ë", "\303\253"},
- {"ì", "\303\254"},
- {"í", "\303\255"},
- {"î", "\303\256"},
- {"ï", "\303\257"},
- {"ð", "\303\260"},
- {"ñ", "\303\261"},
- {"ò", "\303\262"},
- {"ó", "\303\263"},
- {"ô", "\303\264"},
- {"õ", "\303\265"},
- {"ö", "\303\266"},
- {"÷", "\303\267"},
- {"ø", "\303\270"},
- {"ù", "\303\271"},
- {"ú", "\303\272"},
- {"û", "\303\273"},
- {"ü", "\303\274"},
- {"ý", "\303\275"},
- {"þ", "\303\276"},
- {"ÿ", "\303\277"},
- {"Œ", "\305\222"},
- {"œ", "\305\223"},
- {"Š", "\305\240"},
- {"š", "\305\241"},
- {"Ÿ", "\305\270"},
- {"ˆ", "\313\206"},
- {"˜", "\313\234"},
- {" ", "\342\200\202"},
- {" ", "\342\200\203"},
- {" ", "\342\200\211"},
- {"–", "\342\200\223"},
- {"—", "\342\200\224"},
- {"‘", "\342\200\230"},
- {"’", "\342\200\231"},
- {"‚", "\342\200\232"},
- {"“", "\342\200\234"},
- {"”", "\342\200\235"},
- {"„", "\342\200\236"},
- {"†", "\342\200\240"},
- {"‡", "\342\200\241"},
- {"•", "\342\200\242"},
- {"…", "\342\200\246"},
- {"‰", "\342\200\260"},
- {"‹", "\342\200\271"},
- {"›", "\342\200\272"},
- {"€", "\342\202\254"},
- {"™", "\342\204\242"},
- {"&quot;", "\42"},
- {"&amp;", "\46"},
- {"&apos;", "\47"},
- {"&lt;", "\74"},
- {"&gt;", "\76"},
- {"&squot;", "\47"},
- {" ", "\40"},
- {"¡", "\302\241"},
- {"¢", "\302\242"},
- {"£", "\302\243"},
- {"¤", "\302\244"},
- {"¥", "\302\245"},
- {"¦", "\302\246"},
- {"§", "\302\247"},
- {"¨", "\302\250"},
- {"©", "\302\251"},
- {"ª", "\302\252"},
- {"«", "\302\253"},
- {"¬", "\302\254"},
- {"­", "\302\255"},
- {"®", "\302\256"},
- {"¯", "\302\257"},
- {"°", "\302\260"},
- {"±", "\302\261"},
- {"&sup2;", "\302\262"},
- {"&sup3;", "\302\263"},
- {"´", "\302\264"},
- {"µ", "\302\265"},
- {"¶", "\302\266"},
- {"·", "\302\267"},
- {"¸", "\302\270"},
- {"&sup1;", "\302\271"},
- {"º", "\302\272"},
- {"»", "\302\273"},
- {"&frac14;", "\302\274"},
- {"&frac12;", "\302\275"},
- {"&frac34;", "\302\276"},
- {"¿", "\302\277"},
- {"À", "\303\200"},
- {"Á", "\303\201"},
- {"Â", "\303\202"},
- {"Ã", "\303\203"},
- {"Ä", "\303\204"},
- {"Å", "\303\205"},
- {"Æ", "\303\206"},
- {"Ç", "\303\207"},
- {"È", "\303\210"},
- {"É", "\303\211"},
- {"Ê", "\303\212"},
- {"Ë", "\303\213"},
- {"Ì", "\303\214"},
- {"Í", "\303\215"},
- {"Î", "\303\216"},
- {"Ï", "\303\217"},
- {"Ð", "\303\220"},
- {"Ñ", "\303\221"},
- {"Ò", "\303\222"},
- {"Ó", "\303\223"},
- {"Ô", "\303\224"},
- {"Õ", "\303\225"},
- {"Ö", "\303\226"},
- {"×", "\303\227"},
- {"Ø", "\303\230"},
- {"Ù", "\303\231"},
- {"Ú", "\303\232"},
- {"Û", "\303\233"},
- {"Ü", "\303\234"},
- {"Ý", "\303\235"},
- {"Þ", "\303\236"},
- {"ß", "\303\237"},
- {"à", "\303\240"},
- {"á", "\303\241"},
- {"â", "\303\242"},
- {"ã", "\303\243"},
- {"ä", "\303\244"},
- {"å", "\303\245"},
- {"æ", "\303\246"},
- {"ç", "\303\247"},
- {"è", "\303\250"},
- {"é", "\303\251"},
- {"ê", "\303\252"},
- {"ë", "\303\253"},
- {"ì", "\303\254"},
- {"í", "\303\255"},
- {"î", "\303\256"},
- {"ï", "\303\257"},
- {"ð", "\303\260"},
- {"ñ", "\303\261"},
- {"ò", "\303\262"},
- {"ó", "\303\263"},
- {"ô", "\303\264"},
- {"õ", "\303\265"},
- {"ö", "\303\266"},
- {"÷", "\303\267"},
- {"ø", "\303\270"},
- {"ù", "\303\271"},
- {"ú", "\303\272"},
- {"û", "\303\273"},
- {"ü", "\303\274"},
- {"ý", "\303\275"},
- {"þ", "\303\276"},
- {"ÿ", "\303\277"},
- {"Œ", "\305\222"},
- {"œ", "\305\223"},
- {"Š", "\305\240"},
- {"š", "\305\241"},
- {"Ÿ", "\305\270"},
- {"ˆ", "\313\206"},
- {"˜", "\313\234"},
- {" ", "\342\200\202"},
- {" ", "\342\200\203"},
- {" ", "\342\200\211"},
- {"–", "\342\200\223"},
- {"—", "\342\200\224"},
- {"‘", "\342\200\230"},
- {"’", "\342\200\231"},
- {"‚", "\342\200\232"},
- {"“", "\342\200\234"},
- {"”", "\342\200\235"},
- {"„", "\342\200\236"},
- {"†", "\342\200\240"},
- {"‡", "\342\200\241"},
- {"•", "\342\200\242"},
- {"…", "\342\200\246"},
- {"‰", "\342\200\260"},
- {"‹", "\342\200\271"},
- {"›", "\342\200\272"},
- {"€", "\342\202\254"},
- {"™", "\342\204\242"}
-};
-
-static GHashTable *default_symbol_table;
-
 static SC_HTMLState sc_html_read_line	(SC_HTMLParser	*parser);
 static void sc_html_append_char			(SC_HTMLParser	*parser,
 					 gchar		 ch);
@@ -340,16 +64,6 @@ SC_HTMLParser *sc_html_parser_new(FILE *fp, CodeConverter *conv)
 	parser->pre = FALSE;
 	parser->indent = 0;
 
-	if (!default_symbol_table) {
-		gint i;
-		default_symbol_table = g_hash_table_new(g_str_hash, g_str_equal);
-		for (i = 0; i < sizeof(symbol_list) / sizeof(symbol_list[0]); i++)
-			g_hash_table_insert(default_symbol_table,
-								symbol_list[i].key, symbol_list[i].val);
-	}
-
-	parser->symbol_table = default_symbol_table;
-
 	return parser;
 }
 
@@ -612,8 +326,7 @@ static void decode_href(SC_HTMLParser *parser)
 	tparser->str = g_string_new(NULL);
 	tparser->buf = g_string_new(parser->href);
 	tparser->bufp = tparser->buf->str;
-	tparser->symbol_table = default_symbol_table;
-	
+
 	tmp = sc_html_parse(tparser);
 	
 	g_free(parser->href);
@@ -725,33 +438,21 @@ static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
 
 static void sc_html_parse_special(SC_HTMLParser *parser)
 {
-	gchar symbol_name[9];
-	gint n;
-	const gchar *val;
+	gchar *entity;
 
 	parser->state = SC_HTML_UNKNOWN;
 	cm_return_if_fail(*parser->bufp == '&');
 
-	/* &foo; */
-	for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
-		;
-	if (n > 7 || parser->bufp[n] != ';') {
+	entity = entity_decode(parser->bufp);
+	if (entity != NULL) {
+		sc_html_append_str(parser, entity, -1);
+		g_free(entity);
+		while (*parser->bufp++ != ';');
+	} else {
 		/* output literal `&' */
 		sc_html_append_char(parser, *parser->bufp++);
-		parser->state = SC_HTML_NORMAL;
-		return;
 	}
-	strncpy2(symbol_name, parser->bufp, n + 2);
-	parser->bufp += n + 1;
-
-	if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
-	    != NULL) {
-		sc_html_append_str(parser, val, -1);
-		parser->state = SC_HTML_NORMAL;
-		return;
-	} 
-
-	sc_html_append_str(parser, symbol_name, -1);
+	parser->state = SC_HTML_NORMAL;
 }
 
 static gchar *sc_html_find_tag(SC_HTMLParser *parser, const gchar *tag)
diff --git a/src/html.h b/src/html.h
index 98e2f3a..922389a 100644
--- a/src/html.h
+++ b/src/html.h
@@ -1,6 +1,6 @@
 /*
- * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
- * Copyright (C) 1999-2012 Hiroyuki Yamamoto and the Claws Mail team
+ * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
+ * Copyright (C) 1999-2017 Hiroyuki Yamamoto and the Claws Mail team
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -14,7 +14,6 @@
  *
  * You should have received a copy of the GNU General Public License
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
- * 
  */
 
 #ifndef __HTML_H__
@@ -51,9 +50,6 @@ struct _SC_HTMLParser
 	FILE *fp;
 	CodeConverter *conv;
 
-	GHashTable *symbol_table;
-	GHashTable *alt_symbol_table;
-
 	GString *str;
 	GString *buf;
 

commit 4d178a21c1d763945c6f27bc01814d58f7fa3dd7
Author: Ricardo Mones <ricardo at mones.org>
Date:   Mon Nov 13 20:40:11 2017 +0100

    Rssyl: use new entity decoding API

diff --git a/src/plugins/rssyl/strutils.c b/src/plugins/rssyl/strutils.c
index fe8f148..7d172c8 100644
--- a/src/plugins/rssyl/strutils.c
+++ b/src/plugins/rssyl/strutils.c
@@ -30,6 +30,7 @@
 
 /* Claws Mail includes */
 #include <common/utils.h>
+#include <entity.h>
 
 /* Local includes */
 /* (shouldn't be any) */
@@ -120,28 +121,6 @@ struct _RSSyl_HTMLSymbol
 	gchar *const val;
 };
 
-/* TODO: find a way to offload this to a library which knows all the
- * defined named entities (over 200). */
-static RSSyl_HTMLSymbol symbol_list[] = {
-	{ "lt", "<" },
-	{ "gt", ">" },
-	{ "amp", "&" },
-	{ "apos", "'" },
-	{ "quot", "\"" },
-	{ "lsquo",  "‘" },
-	{ "rsquo",  "’" },
-	{ "ldquo",  "“" },
-	{ "rdquo",  "”" },
-	{ "nbsp", " " },
-	{ "trade", "™" },
-	{ "copy", "©" },
-	{ "reg", "®" },
-	{ "hellip", "…" },
-	{ "mdash", "—" },
-	{ "euro", "€" },
-	{ NULL, NULL }
-};
-
 static RSSyl_HTMLSymbol tag_list[] = {
 	{ "<cite>", "\"" },
 	{ "</cite>", "\"" },
@@ -160,55 +139,21 @@ static RSSyl_HTMLSymbol tag_list[] = {
 static gchar *rssyl_replace_chrefs(gchar *string)
 {
 	char *new = g_malloc0(strlen(string) + 1), *ret;
-	char buf[16], tmp[6];
-	int i, ii, j, n, len;
-	gunichar c;
-	gboolean valid, replaced;
+	gchar *entity;
+	int i, ii;
 
 	/* &xx; */
 	ii = 0;
 	for (i = 0; i < strlen(string); ++i) {
 		if (string[i] == '&') {
-			j = i+1;
-			n = 0;
-			valid = FALSE;
-			while (string[j] != '\0' && n < 16) {
-				if (string[j] != ';') {
-					buf[n++] = string[j];
-				} else {
-					/* End of entity */
-					valid = TRUE;
-					buf[n] = '\0';
-					break;
-				}
-				j++;
-			}
-			if (strlen(buf) > 0 && valid) {
-				replaced = FALSE;
-
-				if (buf[0] == '#' && (c = atoi(buf+1)) > 0) {
-					len = g_unichar_to_utf8(c, tmp);
-					tmp[len] = '\0';
-					g_strlcat(new, tmp, strlen(string));
-					ii += len;
-					replaced = TRUE;
-				} else {
-					for (c = 0; symbol_list[c].key != NULL; c++) {
-						if (!strcmp(buf, symbol_list[c].key)) {
-							g_strlcat(new, symbol_list[c].val, strlen(string));
-							ii += strlen(symbol_list[c].val);
-							replaced = TRUE;
-							break;
-						}
-					}
-				}
-				if (!replaced) {
-					new[ii++] = '&'; /* & */
-					g_strlcat(new, buf, strlen(string));
-					ii += strlen(buf);
-					new[ii++] = ';';
-				}
-				i = j;
+			entity = entity_decode(&(string[i]));
+			if (entity != NULL) {
+				g_strlcat(new, entity, strlen(string));
+				ii += strlen(entity);
+				g_free(entity);
+				entity = NULL;
+				while (string[++i] != ';');
+				--i; /* loop will inc it again */
 			} else {
 				new[ii++] = string[i];
 			}

commit 1285ad7143f2aa0cf1fc3b6bad8af828d49ffb4c
Author: Ricardo Mones <ricardo at mones.org>
Date:   Mon Nov 13 01:31:44 2017 +0100

    Complete, normalize and fix table of entities
    
    https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references

diff --git a/src/entity.c b/src/entity.c
index cc72291..98a01e4 100644
--- a/src/entity.c
+++ b/src/entity.c
@@ -37,137 +37,285 @@ struct _EntitySymbol
 	gchar *const value;
 };
 
+/* in alphabetical order with upper-case version first */
 static EntitySymbol symbolic_entities[] = {
-	/* in alphabetical order with upper-case version first */
-	{"Aacute", "\303\201"},
-	{"aacute", "\303\241"},
-	{"Acirc", "\303\202"},
-	{"acirc", "\303\242"},
-	{"acute", "\302\264"},
-	{"AElig", "\303\206"},
-	{"aelig", "\303\246"},
-	{"Agrave", "\303\200"},
-	{"agrave", "\303\240"},
-	{"amp", "&" },
-	{"apos", "'" },
-	{"Aring", "\303\205"},
-	{"aring", "\303\245"},
-	{"Atilde", "\303\203"},
-	{"atilde", "\303\243"},
-	{"Auml", "\303\204"},
-	{"auml", "\303\244"},
-	{"bdquo", "\342\200\236"},
-	{"brvbar", "\302\246"},
-	{"bull", "\342\200\242"},
-	{"Ccedil", "\303\207"},
-	{"ccedil", "\303\247"},
-	{"cedil", "\302\270"},
-	{"cent", "\302\242"},
-	{"circ", "\313\206"},
-	{"copy", "©" },
-	{"curren", "\302\244"},
-	{"dagger", "\342\200\240"},
-	{"Dagger", "\342\200\241"},
-	{"deg", "\302\260"},
-	{"divide", "\303\267"},
-	{"Eacute", "\303\211"},
-	{"eacute", "\303\251"},
-	{"Ecirc", "\303\212"},
-	{"ecirc", "\303\252"},
-	{"Egrave", "\303\210"},
-	{"egrave", "\303\250"},
-	{"emsp", "\342\200\203"},
-	{"ensp", "\342\200\202"},
-	{"ETH", "\303\220"},
-	{"eth", "\303\260"},
-	{"Euml", "\303\213"},
-	{"euml", "\303\253"},
-	{"euro", "€" },
-	{"frac12", "\302\275"},
-	{"frac14", "\302\274"},
-	{"frac34", "\302\276"},
-	{"gt", ">" },
-	{"hellip", "…" },
-	{"Iacute", "\303\215"},
-	{"iacute", "\303\255"},
-	{"Icirc", "\303\216"},
-	{"icirc", "\303\256"},
-	{"iexcl", "\302\241"},
-	{"Igrave", "\303\214"},
-	{"igrave", "\303\254"},
-	{"iquest", "\302\277"},
-	{"Iuml", "\303\217"},
-	{"iuml", "\303\257"},
-	{"laquo", "\302\253"},
-	{"ldquo",  "“" },
-	{"lsaquo", "\342\200\271"},
-	{"lsquo",  "‘" },
-	{"lt", "<" },
-	{"macr", "\302\257"},
-	{"mdash", "—" },
-	{"micro", "\302\265"},
-	{"middot", "\302\267"},
-	{"nbsp", " " },
-	{"ndash", "\342\200\223"},
-	{"not", "\302\254"},
-	{"Ntilde", "\303\221"},
-	{"ntilde", "\303\261"},
-	{"Oacute", "\303\223"},
-	{"oacute", "\303\263"},
-	{"Ocirc", "\303\224"},
-	{"ocirc", "\303\264"},
-	{"OElig", "\305\222"},
-	{"oelig", "\305\223"},
-	{"Ograve", "\303\222"},
-	{"ograve", "\303\262"},
-	{"ordf", "\302\252"},
-	{"ordm", "\302\272"},
-	{"Oslash", "\303\230"},
-	{"oslash", "\303\270"},
-	{"Otilde", "\303\225"},
-	{"otilde", "\303\265"},
-	{"Ouml", "\303\226"},
-	{"ouml", "\303\266"},
-	{"para", "\302\266"},
-	{"permil", "\342\200\260"},
-	{"plusmn", "\302\261"},
-	{"pound", "\302\243"},
-	{"quot", "\"" },
-	{"raquo", "\302\273"},
-	{"rdquo",  "”" },
-	{"reg", "®" },
-	{"rsaquo", "\342\200\272"},
-	{"rsquo",  "’" },
-	{"sbquo", "\342\200\232"},
-	{"Scaron", "\305\240"},
-	{"scaron", "\305\241"},
-	{"sect", "\302\247"},
-	{"shy", "\302\255"},
-	{"squot", "\47"},
-	{"sup1", "\302\271"},
-	{"sup2", "\302\262"},
-	{"sup3", "\302\263"},
-	{"szlig", "\303\237"},
-	{"thinsp", "\342\200\211"},
-	{"THORN", "\303\236"},
-	{"thorn", "\303\276"},
-	{"tilde", "\313\234"},
-	{"times", "\303\227"},
-	{"trade", "™" },
-	{"Uacute", "\303\232"},
-	{"uacute", "\303\272"},
-	{"Ucirc", "\303\233"},
-	{"ucirc", "\303\273"},
-	{"Ugrave", "\303\231"},
-	{"ugrave", "\303\271"},
-	{"uml", "\302\250"},
-	{"Uuml", "\303\234"},
-	{"uuml", "\303\274"},
-	{"Yacute", "\303\235"},
-	{"yacute", "\303\275"},
-	{"yen", "\302\245"},
-	{"yuml", "\303\277"},
-	{"Yuml", "\305\270"},
+	/* A */
+	{"Aacute", "Á"},
+	{"aacute", "á"},
+	{"Acirc", "Â"},
+	{"acirc", "â"},
+	{"acute", "´"},
+	{"AElig", "Æ"},
+	{"aelig", "æ"},
+	{"Agrave", "À"},
+	{"agrave", "à"},
+	{"alefsym", "ℵ"},
+	{"Alpha", "Α"},
+	{"alpha", "α"},
+	{"amp", "&"},
+	{"and", "∧"},
+	{"ang", "∠"},
+	{"apos", "'"},
+	{"Aring", "Å"},
+	{"aring", "å"},
+	{"asymp", "≈"},
+	{"Atilde", "Ã"},
+	{"atilde", "ã"},
+	{"Auml", "Ä"},
+	{"auml", "ä"},
+	/* B */
+	{"bdquo", "„"},
+	{"Beta", "Β"},
+	{"beta", "β"},
+	{"brvbar", "¦"},
+	{"bull", "•"},
+	/* C */
+	{"cap", "∩"},
+	{"Ccedil", "Ç"},
+	{"ccedil", "ç"},
+	{"cedil", "¸"},
+	{"cent", "¢"},
+	{"Chi", "Χ"},
+	{"chi", "χ"},
+	{"circ", "ˆ"},
+	{"clubs", "♣"},
+	{"cong", "≅"},
+	{"copy", "©"},
+	{"crarr", "↵"},
+	{"cup", "∪"},
+	{"curren", "¤"},
+	/* D */
+	{"dagger", "†"},
+	{"Dagger", "‡"},
+	{"dArr", "⇓"},
+	{"darr", "↓"},
+	{"deg", "°"},
+	{"Delta", "Δ"},
+	{"delta", "δ"},
+	{"diams", "♦"},
+	{"divide", "÷"},
+	/* E */
+	{"Eacute", "É"},
+	{"eacute", "é"},
+	{"Ecirc", "Ê"},
+	{"ecirc", "ê"},
+	{"Egrave", "È"},
+	{"egrave", "è"},
+	{"empty", "∅"},
+	{"emsp", "\xE2\x80\x83"},
+	{"ensp", "\xE2\x80\x82"},
+	{"Epsilon", "Ε"},
+	{"epsilon", "ε"},
+	{"equiv", "≡"},
+	{"Eta", "Η"},
+	{"eta", "η"},
+	{"ETH", "Ð"},
+	{"eth", "ð"},
+	{"Euml", "Ë"},
+	{"euml", "ë"},
+	{"euro", "€"},
+	{"exist", "∃"},
+	/* F */
+	{"fnof", "ƒ"},
+	{"forall", "∀"},
+	{"frac12", "½"},
+	{"frac14", "¼"},
+	{"frac34", "¾"},
+	{"frasl", "⁄"},
+	/* G */
+	{"Gamma", "Γ"},
+	{"gamma", "γ"},
+	{"ge", "≥"},
+	{"gt", ">"},
+	/* H */
+	{"hArr", "⇔"},
+	{"harr", "↔"},
+	{"hearts", "♥"},
+	{"hellip", "…"},
+	/* I */
+	{"Iacute", "Í"},
+	{"iacute", "í"},
+	{"IArr", "⇐"},
+	{"Icirc", "Î"},
+	{"icirc", "î"},
+	{"iexcl", "¡"},
+	{"Igrave", "Ì"},
+	{"igrave", "ì"},
+	{"image", "ℑ"},
+	{"infin", "∞"},
+	{"int", "∫"},
+	{"Iota", "Ι"},
+	{"iota", "ι"},
+	{"iquest", "¿"},
+	{"isin", "∈"},
+	{"Iuml", "Ï"},
+	{"iuml", "ï"},
+	/* K */
+	{"Kappa", "Κ"},
+	{"kappa", "κ"},
+	/* L */
+	{"Lambda", "Λ"},
+	{"lambda", "λ"},
+	{"lang", "〈"},
+	{"laquo", "«"},
+	{"larr", "←"},
+	{"lceil", "⌈"},
+	{"ldquo", "“"},
+	{"le", "≤"},
+	{"lfloor", "⌊"},
+	{"lowast", "∗"},
+	{"loz", "◊"},
+	{"lrm", "\xE2\x80\x8E"},
+	{"lsaquo", "‹"},
+	{"lsquo", "‘"},
+	{"lt", "<"},
+	/* M */
+	{"macr", "¯"},
+	{"mdash", "—"},
+	{"micro", "µ"},
+	{"middot", "·"},
+	{"minus", "−"},
+	{"Mu", "Μ"},
+	{"mu", "μ"},
+	/* N */
+	{"nabla", "∇"},
+	{"nbsp", "\xC2\xA0"},
+	{"ndash", "–"},
+	{"ne", "≠"},
+	{"ni", "∋"},
+	{"not", "¬"},
+	{"notin", "∉"},
+	{"nsub", "⊄"},
+	{"Ntilde", "Ñ"},
+	{"ntilde", "ñ"},
+	{"Nu", "Ν"},
+	{"nu", "ν"},
+	/* O */
+	{"Oacute", "Ó"},
+	{"oacute", "ó"},
+	{"Ocirc", "Ô"},
+	{"ocirc", "ô"},
+	{"OElig", "Œ"},
+	{"oelig", "œ"},
+	{"Ograve", "Ò"},
+	{"ograve", "ò"},
+	{"oline", "‾"},
+	{"Omega", "Ω"},
+	{"omega", "ω"},
+	{"Omicron", "Ο"},
+	{"omicron", "ο"},
+	{"oplus", "⊕"},
+	{"or", "∨"},
+	{"ordf", "ª"},
+	{"ordm", "º"},
+	{"Oslash", "Ø"},
+	{"oslash", "ø"},
+	{"Otilde", "Õ"},
+	{"otilde", "õ"},
+	{"otimes", "⊗"},
+	{"Ouml", "Ö"},
+	{"ouml", "ö"},
+	/* P */
+	{"para", "¶"},
+	{"part", "∂"},
+	{"permil", "‰"},
+	{"perp", "⊥"},
+	{"Phi", "Φ"},
+	{"phi", "φ"},
+	{"Pi", "Π"},
+	{"pi", "π"},
+	{"piv", "ϖ"},
+	{"plusmn", "±"},
+	{"pound", "£"},
+	{"Prime", "″"},
+	{"prime", "′"},
+	{"prod", "∏"},
+	{"prop", "∝"},
+	{"Psi", "Ψ"},
+	{"psi", "ψ"},
+	/* Q */
+	{"quot", "\""},
+	/* R */
+	{"radic", "√"},
+	{"rang", "〉"},
+	{"raquo", "»"},
+	{"rArr", "⇒"},
+	{"rarr", "→"},
+	{"rceil", "⌉"},
+	{"rdquo", "”"},
+	{"real", "ℜ"},
+	{"reg", "®"},
+	{"rfloor", "⌋"},
+	{"Rho", "Ρ"},
+	{"rho", "ρ"},
+	{"rlm", "\xE2\x80\x8F"},
+	{"rsaquo", "›"},
+	{"rsquo", "’"},
+	/* S */
+	{"sbquo", "‚"},
+	{"Scaron", "Š"},
+	{"scaron", "š"},
+	{"sdot", "⋅"},
+	{"sect", "§"},
+	{"shy", "\xC2\xAD"},
+	{"Sigma", "Σ"},
+	{"sigma", "σ"},
+	{"sigmaf", "ς"},
+	{"sim", "∼"},
+	{"spades", "♠"},
+	{"sub", "⊂"},
+	{"sube", "⊆"},
+	{"sum", "∑"},
+	{"sup", "⊃"},
+	{"sup1", "¹"},
+	{"sup2", "²"},
+	{"sup3", "³"},
+	{"supe", "⊇"},
+	{"szlig", "ß"},
+	/* T */
+	{"Tau", "Τ"},
+	{"tau", "τ"},
+	{"there4", "∴"},
+	{"Theta", "Θ"},
+	{"theta", "θ"},
+	{"thetasym", "ϑ"},
+	{"thinsp", "\xE2\x80\x89"},
+	{"THORN", "Þ"},
+	{"thorn", "þ"},
+	{"tilde", "˜"},
+	{"times", "×"},
+	{"trade", "™"},
+	/* U */
+	{"Uacute", "Ú"},
+	{"uacute", "ú"},
+	{"uArr", "⇑"},
+	{"uarr", "↑"},
+	{"Ucirc", "Û"},
+	{"ucirc", "û"},
+	{"Ugrave", "Ù"},
+	{"ugrave", "ù"},
+	{"uml", "¨"},
+	{"upsih", "ϒ"},
+	{"Upsilon", "Υ"},
+	{"upsilon", "υ"},
+	{"Uuml", "Ü"},
+	{"uuml", "ü"},
+	/* W */
+	{"weierp", "℘"},
+	/* X */
+	{"Xi", "Ξ"},
+	{"xi", "ξ"},
+	/* Y */
+	{"Yacute", "Ý"},
+	{"yacute", "ý"},
+	{"yen", "¥"},
+	{"Yuml", "Ÿ"},
+	{"yuml", "ÿ"},
+	/* Z */
+	{"Zeta", "Ζ"},
+	{"zeta", "ζ"},
+	{"zwj", "\xE2\x80\x8D"},
+	{"zwnj", "\xE2\x80\x8C"},
 	{NULL, NULL}
 };
 

commit d61f4bdfcbe0f6e081ce15f394f501818039d233
Author: Ricardo Mones <ricardo at mones.org>
Date:   Mon Nov 6 23:41:27 2017 +0100

    Implement HTML entity decoding in one function
    
    Content of symbols table remixed from existing html.c and
    rssyl/strutils.c tables.

diff --git a/src/Makefile.am b/src/Makefile.am
index 1db2b0d..d4eafbc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -142,6 +142,7 @@ claws_mail_SOURCES = \
 	displayheader.c \
 	edittags.c \
 	enriched.c \
+	entity.c \
 	export.c \
         file_checker.c \
 	filtering.c \
@@ -260,6 +261,7 @@ claws_mailinclude_HEADERS = \
 	displayheader.h \
 	edittags.h \
 	enriched.h \
+	entity.h \
 	export.h \
 	filtering.h \
 	folder.h \
diff --git a/src/entity.c b/src/entity.c
new file mode 100644
index 0000000..cc72291
--- /dev/null
+++ b/src/entity.c
@@ -0,0 +1,255 @@
+/*
+ * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
+ * Copyright (C) 2017 Ricardo Mones and the Claws Mail team
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#include "claws-features.h"
+#endif
+
+#include "defs.h"
+#include "utils.h"
+#include "entity.h"
+
+#define ENTITY_MAX_LEN 8
+#define DECODED_MAX_LEN 6
+
+static GHashTable *symbol_table = NULL;
+
+typedef struct _EntitySymbol EntitySymbol;
+
+struct _EntitySymbol
+{
+	gchar *const key;
+	gchar *const value;
+};
+
+static EntitySymbol symbolic_entities[] = {
+	/* in alphabetical order with upper-case version first */
+	{"Aacute", "\303\201"},
+	{"aacute", "\303\241"},
+	{"Acirc", "\303\202"},
+	{"acirc", "\303\242"},
+	{"acute", "\302\264"},
+	{"AElig", "\303\206"},
+	{"aelig", "\303\246"},
+	{"Agrave", "\303\200"},
+	{"agrave", "\303\240"},
+	{"amp", "&" },
+	{"apos", "'" },
+	{"Aring", "\303\205"},
+	{"aring", "\303\245"},
+	{"Atilde", "\303\203"},
+	{"atilde", "\303\243"},
+	{"Auml", "\303\204"},
+	{"auml", "\303\244"},
+	{"bdquo", "\342\200\236"},
+	{"brvbar", "\302\246"},
+	{"bull", "\342\200\242"},
+	{"Ccedil", "\303\207"},
+	{"ccedil", "\303\247"},
+	{"cedil", "\302\270"},
+	{"cent", "\302\242"},
+	{"circ", "\313\206"},
+	{"copy", "©" },
+	{"curren", "\302\244"},
+	{"dagger", "\342\200\240"},
+	{"Dagger", "\342\200\241"},
+	{"deg", "\302\260"},
+	{"divide", "\303\267"},
+	{"Eacute", "\303\211"},
+	{"eacute", "\303\251"},
+	{"Ecirc", "\303\212"},
+	{"ecirc", "\303\252"},
+	{"Egrave", "\303\210"},
+	{"egrave", "\303\250"},
+	{"emsp", "\342\200\203"},
+	{"ensp", "\342\200\202"},
+	{"ETH", "\303\220"},
+	{"eth", "\303\260"},
+	{"Euml", "\303\213"},
+	{"euml", "\303\253"},
+	{"euro", "€" },
+	{"frac12", "\302\275"},
+	{"frac14", "\302\274"},
+	{"frac34", "\302\276"},
+	{"gt", ">" },
+	{"hellip", "…" },
+	{"Iacute", "\303\215"},
+	{"iacute", "\303\255"},
+	{"Icirc", "\303\216"},
+	{"icirc", "\303\256"},
+	{"iexcl", "\302\241"},
+	{"Igrave", "\303\214"},
+	{"igrave", "\303\254"},
+	{"iquest", "\302\277"},
+	{"Iuml", "\303\217"},
+	{"iuml", "\303\257"},
+	{"laquo", "\302\253"},
+	{"ldquo",  "“" },
+	{"lsaquo", "\342\200\271"},
+	{"lsquo",  "‘" },
+	{"lt", "<" },
+	{"macr", "\302\257"},
+	{"mdash", "—" },
+	{"micro", "\302\265"},
+	{"middot", "\302\267"},
+	{"nbsp", " " },
+	{"ndash", "\342\200\223"},
+	{"not", "\302\254"},
+	{"Ntilde", "\303\221"},
+	{"ntilde", "\303\261"},
+	{"Oacute", "\303\223"},
+	{"oacute", "\303\263"},
+	{"Ocirc", "\303\224"},
+	{"ocirc", "\303\264"},
+	{"OElig", "\305\222"},
+	{"oelig", "\305\223"},
+	{"Ograve", "\303\222"},
+	{"ograve", "\303\262"},
+	{"ordf", "\302\252"},
+	{"ordm", "\302\272"},
+	{"Oslash", "\303\230"},
+	{"oslash", "\303\270"},
+	{"Otilde", "\303\225"},
+	{"otilde", "\303\265"},
+	{"Ouml", "\303\226"},
+	{"ouml", "\303\266"},
+	{"para", "\302\266"},
+	{"permil", "\342\200\260"},
+	{"plusmn", "\302\261"},
+	{"pound", "\302\243"},
+	{"quot", "\"" },
+	{"raquo", "\302\273"},
+	{"rdquo",  "”" },
+	{"reg", "®" },
+	{"rsaquo", "\342\200\272"},
+	{"rsquo",  "’" },
+	{"sbquo", "\342\200\232"},
+	{"Scaron", "\305\240"},
+	{"scaron", "\305\241"},
+	{"sect", "\302\247"},
+	{"shy", "\302\255"},
+	{"squot", "\47"},
+	{"sup1", "\302\271"},
+	{"sup2", "\302\262"},
+	{"sup3", "\302\263"},
+	{"szlig", "\303\237"},
+	{"thinsp", "\342\200\211"},
+	{"THORN", "\303\236"},
+	{"thorn", "\303\276"},
+	{"tilde", "\313\234"},
+	{"times", "\303\227"},
+	{"trade", "™" },
+	{"Uacute", "\303\232"},
+	{"uacute", "\303\272"},
+	{"Ucirc", "\303\233"},
+	{"ucirc", "\303\273"},
+	{"Ugrave", "\303\231"},
+	{"ugrave", "\303\271"},
+	{"uml", "\302\250"},
+	{"Uuml", "\303\234"},
+	{"uuml", "\303\274"},
+	{"Yacute", "\303\235"},
+	{"yacute", "\303\275"},
+	{"yen", "\302\245"},
+	{"yuml", "\303\277"},
+	{"Yuml", "\305\270"},
+	{NULL, NULL}
+};
+
+static gchar* entity_extract_to_buffer(gchar *p, gchar b[])
+{
+	gint i = 0;
+
+	while (*p != '\0' && *p != ';' && i < ENTITY_MAX_LEN) {
+		b[i] = *p;
+		++i, ++p;
+	}
+	if (*p != ';' || i == ENTITY_MAX_LEN)
+		return NULL;
+	b[i] = '\0';
+
+	return b;
+}
+
+static gchar *entity_decode_numeric(gchar *str)
+{
+	gchar b[ENTITY_MAX_LEN];
+	gchar *p = str, *res;
+	gboolean hex = FALSE;
+	gunichar c;
+
+	++p;
+	if (*p == '\0')
+		return NULL;
+
+	if (*p == 'x') {
+		hex = TRUE;
+		++p;
+		if (*p == '\0')
+			return NULL;
+	}
+
+	if (entity_extract_to_buffer (p, b) == NULL)
+		return NULL;
+
+	c = g_ascii_strtoll (b, NULL, (hex? 16: 10));
+	res = g_malloc0 (DECODED_MAX_LEN + 1);
+	g_unichar_to_utf8 (c, res);
+
+	return res;
+}
+
+static gchar *entity_decode_symbol(gchar *str)
+{
+	gchar b[ENTITY_MAX_LEN];
+	gchar *decoded;
+
+	if (entity_extract_to_buffer (str, b) == NULL)
+		return NULL;
+
+	if (symbol_table == NULL) {
+		gint i;
+
+		symbol_table = g_hash_table_new (g_str_hash, g_str_equal);
+		for (i = 0; symbolic_entities[i].key != NULL; ++i) {
+			g_hash_table_insert (symbol_table,
+				symbolic_entities[i].key, symbolic_entities[i].value);
+		}
+		debug_print("initialized entities table with %d symbols\n", i);
+	}
+
+	decoded = g_hash_table_lookup (symbol_table, b);
+	if (decoded != NULL)
+		return g_strdup (decoded);
+
+	return NULL;
+}
+
+gchar *entity_decode(gchar *str)
+{
+	gchar *p = str;
+	if (p == NULL || *p != '&')
+		return NULL;
+	++p;
+	if (*p == '\0')
+		return NULL;
+	if (*p == '#')
+		return entity_decode_numeric(p);
+	else
+		return entity_decode_symbol(p);
+}
diff --git a/src/entity.h b/src/entity.h
new file mode 100644
index 0000000..9e3e492
--- /dev/null
+++ b/src/entity.h
@@ -0,0 +1,33 @@
+/*
+ * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
+ * Copyright (C) 2017 Ricardo Mones and the Claws Mail team
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ENTITY_H__
+#define __ENTITY_H__
+
+#include <glib.h>
+
+/*
+ * Try to decode the HTML entity pointed to by str, whose first element
+ * must be the '&' character.
+ *
+ * Returns a newly-allocated string with the decoded entity or NULL
+ * on failure to decode (like an unknown or invalid entity).
+ * Returned strings must be freed with g_free().
+ */
+gchar *entity_decode(gchar *str);
+
+#endif /* __ENTITY_H__ */

-----------------------------------------------------------------------


hooks/post-receive
-- 
Claws Mail


More information about the Commits mailing list