package godown import ( "bytes" "fmt" "io" "regexp" "strings" "github.com/mattn/go-runewidth" "golang.org/x/net/html" ) func isChildOf(node *html.Node, name string) bool { node = node.Parent return node != nil && node.Type == html.ElementNode && strings.ToLower(node.Data) == name } func hasClass(node *html.Node, clazz string) bool { for _, attr := range node.Attr { if attr.Key == "class" { for _, c := range strings.Fields(attr.Val) { if c == clazz { return true } } } } return false } func attr(node *html.Node, key string) string { for _, attr := range node.Attr { if attr.Key == key { return attr.Val } } return "" } func br(node *html.Node, w io.Writer, option *Option) { node = node.PrevSibling if node == nil { return } switch node.Type { case html.TextNode: text := strings.Trim(node.Data, " \t") if text != "" && !strings.HasSuffix(text, "\n") { fmt.Fprint(w, "\n") } case html.ElementNode: switch strings.ToLower(node.Data) { case "br", "p", "ul", "ol", "div", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6": fmt.Fprint(w, "\n") } } } func table(node *html.Node, w io.Writer, option *Option) { for tr := node.FirstChild; tr != nil; tr = tr.NextSibling { if tr.Type == html.ElementNode && strings.ToLower(tr.Data) == "tbody" { node = tr break } } var header bool var rows [][]string for tr := node.FirstChild; tr != nil; tr = tr.NextSibling { if tr.Type != html.ElementNode || strings.ToLower(tr.Data) != "tr" { continue } var cols []string if !header { for th := tr.FirstChild; th != nil; th = th.NextSibling { if th.Type != html.ElementNode || strings.ToLower(th.Data) != "th" { continue } var buf bytes.Buffer walk(th, &buf, 0, option) cols = append(cols, buf.String()) } if len(cols) > 0 { rows = append(rows, cols) header = true continue } } for td := tr.FirstChild; td != nil; td = td.NextSibling { if td.Type != html.ElementNode || strings.ToLower(td.Data) != "td" { continue } var buf bytes.Buffer walk(td, &buf, 0, option) cols = append(cols, buf.String()) } rows = append(rows, cols) } maxcol := 0 for _, cols := range rows { if len(cols) > maxcol { maxcol = len(cols) } } widths := make([]int, maxcol) for _, cols := range rows { for i := 0; i < maxcol; i++ { if i < len(cols) { width := runewidth.StringWidth(cols[i]) if widths[i] < width { widths[i] = width } } } } for i, cols := range rows { for j := 0; j < maxcol; j++ { fmt.Fprint(w, "|") if j < len(cols) { width := runewidth.StringWidth(cols[j]) fmt.Fprint(w, cols[j]) fmt.Fprint(w, strings.Repeat(" ", widths[j]-width)) } else { fmt.Fprint(w, strings.Repeat(" ", widths[j])) } } fmt.Fprint(w, "|\n") if i == 0 && header { for j := 0; j < maxcol; j++ { fmt.Fprint(w, "|") fmt.Fprint(w, strings.Repeat("-", widths[j])) } fmt.Fprint(w, "|\n") } } fmt.Fprint(w, "\n") } var emptyElements = []string{ "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr", } func raw(node *html.Node, w io.Writer, option *Option) { switch node.Type { case html.ElementNode: fmt.Fprintf(w, "<%s", node.Data) for _, attr := range node.Attr { fmt.Fprintf(w, " %s=%q", attr.Key, attr.Val) } found := false tag := strings.ToLower(node.Data) for _, e := range emptyElements { if e == tag { found = true break } } if found { fmt.Fprint(w, "/>") } else { fmt.Fprint(w, ">") for c := node.FirstChild; c != nil; c = c.NextSibling { raw(c, w, option) } fmt.Fprintf(w, "</%s>", node.Data) } case html.TextNode: fmt.Fprint(w, node.Data) } } func bq(node *html.Node, w io.Writer, option *Option) { if node.Type == html.TextNode { fmt.Fprint(w, strings.Replace(node.Data, "\u00a0", " ", -1)) } else { for c := node.FirstChild; c != nil; c = c.NextSibling { bq(c, w, option) } } } func pre(node *html.Node, w io.Writer, option *Option) { if node.Type == html.TextNode { fmt.Fprint(w, node.Data) } else { for c := node.FirstChild; c != nil; c = c.NextSibling { pre(c, w, option) } } } func walk(node *html.Node, w io.Writer, nest int, option *Option) { if node.Type == html.TextNode { if strings.TrimSpace(node.Data) != "" { text := regexp.MustCompile(`[[:space:]][[:space:]]*`).ReplaceAllString(strings.Trim(node.Data, "\t\r\n"), " ") fmt.Fprint(w, text) } } n := 0 for c := node.FirstChild; c != nil; c = c.NextSibling { switch c.Type { case html.CommentNode: fmt.Fprint(w, "<!--") fmt.Fprint(w, c.Data) fmt.Fprint(w, "-->\n") case html.ElementNode: switch strings.ToLower(c.Data) { case "a": fmt.Fprint(w, "[") walk(c, w, nest, option) fmt.Fprint(w, "]("+attr(c, "href")+")") case "b", "strong": fmt.Fprint(w, "**") walk(c, w, nest, option) fmt.Fprint(w, "**") case "i", "em": fmt.Fprint(w, "_") walk(c, w, nest, option) fmt.Fprint(w, "_") case "del": fmt.Fprint(w, "~~") walk(c, w, nest, option) fmt.Fprint(w, "~~") case "br": br(c, w, option) fmt.Fprint(w, "\n\n") case "p": br(c, w, option) walk(c, w, nest, option) br(c, w, option) fmt.Fprint(w, "\n\n") case "code": if !isChildOf(c, "pre") { fmt.Fprint(w, "`") pre(c, w, option) fmt.Fprint(w, "`") } case "pre": br(c, w, option) var buf bytes.Buffer pre(c, &buf, option) var lang string if option != nil && option.GuessLang != nil { if guess, err := option.GuessLang(buf.String()); err == nil { lang = guess } } fmt.Fprint(w, "```"+lang+"\n") fmt.Fprint(w, buf.String()) if !strings.HasSuffix(buf.String(), "\n") { fmt.Fprint(w, "\n") } fmt.Fprint(w, "```\n\n") case "div": br(c, w, option) walk(c, w, nest, option) fmt.Fprint(w, "\n") case "blockquote": br(c, w, option) var buf bytes.Buffer if hasClass(c, "code") { bq(c, &buf, option) var lang string if option != nil && option.GuessLang != nil { if guess, err := option.GuessLang(buf.String()); err == nil { lang = guess } } fmt.Fprint(w, "```"+lang+"\n") fmt.Fprint(w, strings.TrimLeft(buf.String(), "\n")) if !strings.HasSuffix(buf.String(), "\n") { fmt.Fprint(w, "\n") } fmt.Fprint(w, "```\n\n") } else { walk(c, &buf, nest+1, option) if lines := strings.Split(strings.TrimSpace(buf.String()), "\n"); len(lines) > 0 { for _, l := range lines { fmt.Fprint(w, "> "+strings.TrimSpace(l)+"\n") } fmt.Fprint(w, "\n") } } case "ul", "ol": br(c, w, option) var buf bytes.Buffer walk(c, &buf, 1, option) if lines := strings.Split(strings.TrimSpace(buf.String()), "\n"); len(lines) > 0 { for i, l := range lines { if i > 0 || nest > 0 { fmt.Fprint(w, "\n") } fmt.Fprint(w, strings.Repeat(" ", nest)+strings.TrimSpace(l)) } fmt.Fprint(w, "\n") } case "li": br(c, w, option) if isChildOf(c, "ul") { fmt.Fprint(w, "* ") } else if isChildOf(c, "ol") { n++ fmt.Fprint(w, fmt.Sprintf("%d. ", n)) } walk(c, w, nest, option) fmt.Fprint(w, "\n") case "h1", "h2", "h3", "h4", "h5", "h6": br(c, w, option) fmt.Fprint(w, strings.Repeat("#", int(rune(c.Data[1])-rune('0')))+" ") walk(c, w, nest, option) fmt.Fprint(w, "\n\n") case "img": fmt.Fprint(w, "!["+attr(c, "alt")+"]("+attr(c, "src")+")") case "hr": br(c, w, option) fmt.Fprint(w, "\n---\n\n") case "table": br(c, w, option) table(c, w, option) case "style": if option != nil && option.Style { br(c, w, option) raw(c, w, option) fmt.Fprint(w, "\n\n") } case "script": if option != nil && option.Script { br(c, w, option) raw(c, w, option) fmt.Fprint(w, "\n\n") } default: if option == nil || option.CustomRules == nil { walk(c, w, nest, option) break } foundCustom := false for _, cr := range option.CustomRules { if tag, customWalk := cr.Rule(walk); strings.ToLower(c.Data) == tag { customWalk(c, w, nest, option) foundCustom = true } } if foundCustom { break } walk(c, w, nest, option) } default: walk(c, w, nest, option) } } } // WalkFunc type is an signature for functions traversing HTML nodes type WalkFunc func(node *html.Node, w io.Writer, nest int, option *Option) // CustomRule is an interface to define custom conversion rules // // Rule method accepts `next WalkFunc` as an argument, which `customRule` should call // to let walk function continue parsing the content inside the HTML tag. // It returns a tagName to indicate what HTML element this `customRule` handles and the `customRule` // function itself, where conversion logic should reside. // // See example TestRule implementation in godown_test.go type CustomRule interface { Rule(next WalkFunc) (tagName string, customRule WalkFunc) } // Option is optional information for Convert. type Option struct { GuessLang func(string) (string, error) Script bool Style bool CustomRules []CustomRule } // Convert convert HTML to Markdown. Read HTML from r and write to w. func Convert(w io.Writer, r io.Reader, option *Option) error { doc, err := html.Parse(r) if err != nil { return err } walk(doc, w, 0, option) fmt.Fprint(w, "\n") return nil }