[PATCH 4/4] Introduce a fallback encoding (e.g. for blobs)

Julius Plenz plenz at cis.fu-berlin.de
Tue Oct 30 14:07:18 CET 2012


Usually you'll want to deliver the web pages using UTF-8. Converting
Git's commit information to PAGE_ENCODING is no problem: if it is not
UTF-8, the encoding that was used is specified.

In the case of blobs, Git by design doesn't want to know anything about
the encoding. But to make the file appear "normal" in the browser,
FALLBACK_ENCODING (default: latin1) serves as a hint as to which
encoding the string originates from. When the plain file is delivered,
the charset is set to the specified fallback if the blob's contents
are not valid UTF-8.

The same applies to the "Tagger" information, since tag objects don't
have an "encoding" field. (See:
http://git.661346.n2.nabble.com/PATCH-RFC-Document-format-of-basic-Git-objects-tp7287428p7288762.html )

Signed-off-by: Julius Plenz <plenz at cis.fu-berlin.de>
---
 cgit.h      |  4 +++-
 parsing.c   | 11 +++++++++++
 ui-diff.c   |  2 +-
 ui-plain.c  |  2 ++
 ui-refs.c   |  2 +-
 ui-ssdiff.c | 10 +++++-----
 ui-tag.c    |  4 ++--
 ui-tree.c   |  2 +-
 8 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/cgit.h b/cgit.h
index 4a3f528..fb7b9db 100644
--- a/cgit.h
+++ b/cgit.h
@@ -43,9 +43,11 @@
 
 
 /*
- * Default encoding
+ * Default encoding and fallback encoding in case blobs are not valid UTF-8
  */
 #define PAGE_ENCODING "UTF-8"
+#define FALLBACK_ENCODING "latin1"
+extern const char *to_pageencoding(const char *txt);
 
 typedef void (*configfn)(const char *name, const char *value);
 typedef void (*filepair_fn)(struct diff_filepair *pair);
diff --git a/parsing.c b/parsing.c
index 602e3de..2a03b11 100644
--- a/parsing.c
+++ b/parsing.c
@@ -98,6 +98,7 @@ char *parse_user(char *t, char **name, char **email, unsigned long *date)
 
 #ifdef NO_ICONV
 #define reencode(a, b, c)
+#define to_pageencoding(a)
 #else
 const char *reencode(char **txt, const char *src_enc, const char *dst_enc)
 {
@@ -120,6 +121,16 @@ const char *reencode(char **txt, const char *src_enc, const char *dst_enc)
 	}
 	return *txt;
 }
+const char *to_pageencoding(const char *txt)
+{
+	if(is_encoding_utf8(PAGE_ENCODING) && !is_utf8(txt)) {
+		char *tmp = xstrdup(txt);
+		reencode(&tmp, FALLBACK_ENCODING, PAGE_ENCODING);
+		return tmp;
+	}
+	return txt;
+}
+
 #endif
 
 struct commitinfo *cgit_parse_commit(struct commit *commit)
diff --git a/ui-diff.c b/ui-diff.c
index c6bad63..2d90a46 100644
--- a/ui-diff.c
+++ b/ui-diff.c
@@ -211,7 +211,7 @@ static void print_line(char *line, int len)
 
 	htmlf("<div class='%s'>", class);
 	line[len-1] = '\0';
-	html_txt(line);
+	html_txt(to_pageencoding(line));
 	html("</div>");
 	line[len-1] = c;
 }
diff --git a/ui-plain.c b/ui-plain.c
index 85877d7..baa5a2f 100644
--- a/ui-plain.c
+++ b/ui-plain.c
@@ -95,6 +95,8 @@ static void print_object(const unsigned char *sha1, const char *path)
 	ctx.page.filename = fmt("%s", path);
 	ctx.page.size = size;
 	ctx.page.etag = sha1_to_hex(sha1);
+	if(is_encoding_utf8(PAGE_ENCODING) && !is_utf8(buf)) /* best guess */
+		ctx.page.charset = FALLBACK_ENCODING;
 	cgit_print_http_headers(&ctx);
 	html_raw(buf, size);
 	match = 1;
diff --git a/ui-refs.c b/ui-refs.c
index caddfbc..15cfe0b 100644
--- a/ui-refs.c
+++ b/ui-refs.c
@@ -143,7 +143,7 @@ static int print_tag(struct refinfo *ref)
 			cgit_object_link(tag->tagged);
 		html("</td><td>");
 		if (info->tagger)
-			html(info->tagger);
+			html(to_pageencoding(info->tagger));
 		html("</td><td colspan='2'>");
 		if (info->tagger_date > 0)
 			cgit_print_age(info->tagger_date, -1, NULL);
diff --git a/ui-ssdiff.c b/ui-ssdiff.c
index fbb46cf..a60112e 100644
--- a/ui-ssdiff.c
+++ b/ui-ssdiff.c
@@ -208,7 +208,7 @@ static void print_part_with_lcs(char *class, char *line, char *lcs)
 			htmlf("</span>");
 			j += 1;
 		}
-		html_txt(c);
+		html_txt(to_pageencoding(c));
 	}
 }
 
@@ -244,7 +244,7 @@ static void print_ssdiff_line(char *class,
 		if (lcs)
 			print_part_with_lcs("del", old_line, lcs);
 		else
-			html_txt(old_line);
+			html_txt(to_pageencoding(old_line));
 	}
 
 	html("</td>\n");
@@ -265,7 +265,7 @@ static void print_ssdiff_line(char *class,
 		if (lcs)
 			print_part_with_lcs("add", new_line, lcs);
 		else
-			html_txt(new_line);
+			html_txt(to_pageencoding(new_line));
 	}
 
 	html("</td></tr>");
@@ -379,11 +379,11 @@ void cgit_ssdiff_line_cb(char *line, int len)
 		current_old_line += 1;
 	} else if (line[0] == '@') {
 		html("<tr><td colspan='4' class='hunk'>");
-		html_txt(line);
+		html_txt(to_pageencoding(line));
 		html("</td></tr>");
 	} else {
 		html("<tr><td colspan='4' class='ctx'>");
-		html_txt(line);
+		html_txt(to_pageencoding(line));
 		html("</td></tr>");
 	}
 	line[len - 1] = c;
diff --git a/ui-tag.c b/ui-tag.c
index 39e4cb8..de88880 100644
--- a/ui-tag.c
+++ b/ui-tag.c
@@ -21,7 +21,7 @@ static void print_tag_content(char *buf)
 	p = strchr(buf, '\n');
 	if (p)
 		*p = '\0';
-	html_txt(buf);
+	html_txt(to_pageencoding(buf));
 	html("</div>");
 	if (p) {
 		html("<div class='commit-msg'>");
@@ -74,7 +74,7 @@ void cgit_print_tag(char *revname)
 		}
 		if (info->tagger) {
 			html("<tr><td>tagged by</td><td>");
-			html_txt(info->tagger);
+			html_txt(to_pageencoding(info->tagger));
 			if (info->tagger_email && !ctx.cfg.noplainemail) {
 				html(" ");
 				html_txt(info->tagger_email);
diff --git a/ui-tree.c b/ui-tree.c
index b1adcc7..35f1ad5 100644
--- a/ui-tree.c
+++ b/ui-tree.c
@@ -55,7 +55,7 @@ static void print_text_buffer(const char *name, char *buf, unsigned long size)
 	}
 
 	html("<td class='lines'><pre><code>");
-	html_txt(buf);
+	html_txt(to_pageencoding(buf));
 	html("</code></pre></td></tr></table>\n");
 }
 
-- 
1.7.12.3-zedat





More information about the CGit mailing list