Swift: parse HTML with libxml2

Sun, 14. Jul 2019

Categories: en Tags: iOS Swift libxml2 ShaarliOS HTML

My iOS sharing extension ShaarliOS💫 (source code + tests) has to communicate with HTML backends: scraping data from HTML forms and sending HTTP POSTs. I want to avoid parsing HTML myself, and luckily libxml2's HTMLParser does exactly that and is available on iOS. Giants, shoulders, you know.

Also, I want to avoid holding the complete document in memory, let alone traversing a DOM, when I can grab the few things I need from a SAX parser while they fly by. I used to get the form data with C/Objective-C and XPath, OMG. Having as little hand-written, non-Swift-managed code as possible for parsing arbitrary content from the internet seems desirable.

So: libxml2 for HTML parsing, SAX (a streaming or push parser) for a slim footprint, and Swift for memory safety and ‘modern’ idioms (closures!).

Let’s add libxml2 sax parsing mojo to the Xcode project, following The Red Queen Coder:

The actual parsing finally turned out simpler than expected.

Especially the closures wiring the SAX callbacks at the top of `parse` in HtmlFormParser.swift look nice, don’t they?

Happy parsing!


Addendum

Bridging.h

 1//
 2// Bridging header to access libxml2 html parsing from Swift.
 3// http://mro.name/ShaarliOS
 4//
 5// First adjust some settings as described by http://redqueencoder.com/wrapping-libxml2-for-swift/
 6//
 7// * add to Xcode build settings “Header Search Paths”:
 8//     $(SDKROOT)/usr/include/libxml2
 9// * add to Xcode build settings "Other Linker Flags."
10//     -lxml2
11//
12// Also: https://github.com/SonoPlot/Swift-libxml/blob/master/LibXMLWrapperExample/LibXMLWrapperExample/Bridging-Header.h
13
14#import <libxml/HTMLparser.h>
15#import <libxml/xmlerror.h>

HtmlFormParser.swift

  1//
  2//  HtmlFormParser.swift
  3//  http://mro.name/ShaarliOS
  4//
  5//  Created by Marcus Rohrmoser on 09.06.19.
  6//  Copyright © 2019 Marcus Rohrmoser mobile Software. All rights reserved.
  7//
  8
  9import Foundation
 10
 // Field name -> field value of one HTML form.
 typealias HtmlFormDict = [String:String]

 // Extract all forms from an HTML document using libxml2's lenient HTML parser.
 //
 // body:     raw bytes of the HTML document; nil gives an empty result.
 // encoding: currently ignored, the parser is left to work out the encoding itself.
 // returns:  form name -> (field name -> field value)
 func findHtmlForms(_ body:Data?, _ encoding:String?) -> [String:HtmlFormDict] {
     let parser = HtmlFormParser()
     let found = parser.parse(body)
     return found
 }
 17
 // Build a dictionary from a flat list of name,value entries that ends at the
 // first missing name.
 //
 // atts: lookup by position; even positions hold names, odd positions hold the
 //       matching values.
 // A missing value (abbreviated html5 attribute such as `checked`) is expanded
 // to the attribute's own name.
 internal func atts2dict(_ atts: (Int) -> String?) -> HtmlFormDict {
     var dict:HtmlFormDict = [:]
     var pos = 0
     while true {
         guard let key = atts(pos) else { break }
         let value = atts(pos + 1)
         dict[key] = value ?? key
         pos += 2
     }
     return dict
 }
 29
 // Turn a NUL-terminated libxml2 string into a Swift String.
 // Returns nil for a nil pointer and for bytes that are not valid UTF-8
 // (invalid sequences are rejected, not repaired).
 // See https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L33
 private func decode(_ bytes:UnsafePointer<xmlChar>?) -> String? {
     return bytes.flatMap { cstr in
         String.decodeCString(cstr, as:UTF8.self, repairingInvalidCodeUnits:false)?.result
     }
 }
 36
 // Recover the parser instance from the opaque user-data pointer handed to the
 // sax callbacks. The reference count is left untouched. A nil pointer is a
 // programming error and traps.
 private func me(_ ptr : UnsafeRawPointer?) -> HtmlFormParser {
     let unmanaged = Unmanaged<HtmlFormParser>.fromOpaque(ptr!)
     return unmanaged.takeUnretainedValue()
 }
 40
 // Collects the fields of every <form> of an HTML document while libxml2's push
 // (sax) parser streams through the bytes; no DOM is built.
 //
 // Result shape: form name (attribute `name`, else `id`, else "") -> field name -> value.
 private class HtmlFormParser {
     // forms completed so far; a form is stored when its end tag is seen.
     private var forms : [String:HtmlFormDict] = [:]
     // fields of the form currently being read.
     private var form : HtmlFormDict = [:]
     // name of the form currently being read.
     private var formName = ""
     // name of the <textarea> currently being read; empty while outside of one.
     private var textName = ""
     // text collected so far for the current <textarea>.
     private var text = ""

     // Parse `data` as HTML and return all forms found.
     // nil data (or a parser context that cannot be created) gives what was
     // collected so far, i.e. an empty dictionary on a fresh instance.
     func parse(_ data:Data?) -> [String:HtmlFormDict] {
         guard let data = data else { return [:] }
         var sax = htmlSAXHandler()
         sax.initialized = XML_SAX2_MAGIC
         // The callbacks capture nothing; the instance is passed as the parser's
         // user-data pointer below and recovered inside each callback via `me`.
         sax.startElement = { me($0).startElement(name:$1, atts:$2) }
         sax.endElement = { me($0).endElement(name:$1) }
         sax.characters = { me($0).charactersFound(ch:$1, len:$2) }
         // handler.error = errorEncounteredSAX

         // https://curl.haxx.se/libcurl/c/htmltitle.html
         // http://xmlsoft.org/html/libxml-HTMLparser.html#htmlParseChunk
         // https://stackoverflow.com/questions/41140050/parsing-large-xml-from-server-while-downloading-with-libxml2-in-swift-3
         // https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L524
         // http://redqueencoder.com/wrapping-libxml2-for-swift/ or https://github.com/SonoPlot/Swift-libxml
         guard let ctxt = htmlCreatePushParserCtxt(&sax, Unmanaged.passUnretained(self).toOpaque(), "", 0, "", XML_CHAR_ENCODING_NONE) else {
             return forms
         }
         defer { htmlFreeParserCtxt(ctxt) }
         // feed the whole document as one chunk ...
         let _ = data.withUnsafeBytes { htmlParseChunk(ctxt, $0, Int32(data.count), 0) }
         // ... and tell the parser that the input is complete.
         htmlParseChunk(ctxt, "", 0, 1)

         return forms
     }

     // sax callback: an element starts. Only form, input and textarea matter.
     // `atts` is a nil-terminated list of name,value pairs and is nil altogether
     // when the tag has no attributes.
     private func startElement(name: UnsafePointer<xmlChar>? , atts:UnsafePointer<UnsafePointer<xmlChar>?>?) {
         // cheap C-string comparison first, so uninteresting tags are not decoded.
         // https://github.com/MaddTheSane/chmox/blob/3263ddf09276f6a47961cc4b87762f58b88772d0/CHMTableOfContents.swift#L75
         guard let nam_ = UnsafeRawPointer(name)?.assumingMemoryBound(to: Int8.self) else { return }
         if 0 != strcmp("form", nam_) && 0 != strcmp("input", nam_) && 0 != strcmp("textarea", nam_) {
             return
         }
         guard let elm = decode(name) else { return }
         // A tag without attributes must still be processed: bailing out here
         // would let a bare <form> or <textarea> inherit the state of the
         // previous one.
         let att : HtmlFormDict
         if let atts = atts {
             att = atts2dict({ decode(atts[$0]) })
         } else {
             att = [:]
         }
         let nam = att["name"] ?? att["id"] ?? ""
         switch elm {
         case "form":
             formName = nam
             form = [:]
         case "textarea":
             textName = nam
             text = ""
         case "input":
             // an <input> without any attribute has neither name nor value.
             if att.isEmpty { break }
             // assigning nil removes the key: an input without value (or a
             // checkbox with checked="off") does not show up in the form.
             form[nam] = "checkbox" == att["type"]
                 ? ("off" == att["checked"] ? nil : att["checked"])
                 : att["value"]
         default:
             break
         }
     }

     // sax callback: an element ends. Stores the finished form or textarea.
     private func endElement(name:UnsafePointer<xmlChar>?) {
         // https://github.com/MaddTheSane/chmox/blob/3263ddf09276f6a47961cc4b87762f58b88772d0/CHMTableOfContents.swift#L75
         guard let nam_ = UnsafeRawPointer(name)?.assumingMemoryBound(to: Int8.self) else { return }
         if 0 != strcmp("form", nam_) && 0 != strcmp("textarea", nam_) {
             return
         }
         let elm = decode(name)
         switch elm {
         case "form":
             forms[formName] = form
             formName = ""
         case "textarea":
             form[textName] = text
             textName = ""
         default:
             break
         }
     }

     // sax callback: character data. Collected only while inside a named
     // <textarea>; `ch` holds `len` bytes and is not NUL-terminated.
     private func charactersFound(ch: UnsafePointer<xmlChar>?, len: CInt) {
         if textName.isEmpty {
             return
         }
         // nothing to append (and nothing to trap on) without bytes.
         guard let ch = ch, len > 0 else { return }
         let d = Data(bytes: ch, count:Int(len))
         let s = String(data: d, encoding: .utf8) ?? "<utf8 decoding issue>"
         text.append(s)
     }
 }
123}