import re
import sys

from lxml import etree as E

fn = r"D:/install/oracle/B19306_01/server.102/b14219/e0.htm"
#fn = r"D:/install/oracle/B19306_01/server.102/b14219/e12700.htm"


def _clear_span(s):
    """Post-processing hook for captured text; currently returns *s* unchanged."""
    return s


def _serialized(element):
    """Return lxml's default serialization of *element* as text.

    ``E.tostring`` emits ASCII bytes by default, so decoding as ASCII is
    lossless and lets the str patterns below match on Python 3 as well.
    """
    return E.tostring(element).decode("ascii")


def parse(fn):
    """Print one line for each ``class="msgentry"`` element in the file *fn*.

    The file is parsed with lxml's default XML parser.  For every node whose
    ``class`` attribute is ``msgentry``, the node and its descendants are
    walked in document order and three fields are collected from nodes with
    these classes:

    * ``msg``         -> error text (group 1 of ``reg1``)
    * ``msgexplankw`` -> explanation (group 2 of ``reg2``)
    * ``msgactionkw`` -> action (group 2 of ``reg3``)

    As soon as a ``msgactionkw`` node is reached the three fields are printed
    on one line and the walk of that entry stops.  An entry without a
    ``msgactionkw`` node prints nothing.

    Errors from opening or parsing the file propagate to the caller.
    """
    # NOTE(review): as written, group 2 of ``(.*)(.*)`` is always empty
    # because the first greedy group consumes the whole first line, so the
    # explanation and action columns are always blank.  These patterns look
    # like they lost their literal markup anchors -- confirm against the
    # intended expressions.  They are kept unchanged here.
    reg1 = re.compile('''(.*)''')
    reg2 = re.compile('''(.*)(.*)''', re.M)
    reg3 = re.compile('''(.*)(.*)''', re.M)

    # Close the handle deterministically; the tree is fully built by parse().
    with open(fn, "rb") as handle:
        tree = E.parse(handle)

    for entry in tree.iter():
        # .get() returns None for nodes without a class attribute, so no
        # exception handling is needed to skip them.
        if entry.get("class") != "msgentry":
            continue

        msgerror = ""
        msgexpl = ""
        msgaction = ""

        for inner in entry.iter():
            # Nodes are selected by class only.  The earlier
            # ``inner.tag.find("span")`` / ``find("div")`` guards were truthy
            # for every tag that did not *start* with the name (find() returns
            # -1 when absent, 0 at position 0), so they never restricted the
            # match to that tag and instead hid un-namespaced span/div nodes.
            css_class = inner.get("class")

            if css_class == "msg":
                m = reg1.match(_serialized(inner))
                if m:
                    msgerror = _clear_span(m.group(1))
            elif css_class == "msgexplankw":
                m = reg2.match(_serialized(inner))
                if m:
                    msgexpl = _clear_span(m.group(2))
            elif css_class == "msgactionkw":
                m = reg3.match(_serialized(inner))
                if m:
                    msgaction = _clear_span(m.group(2))
                # Separator is space+tab, which is what the former Python 2
                # statement ``print a, '\t', b, '\t', c`` wrote.  A single
                # parenthesised argument prints identically on Python 2 and 3.
                print("%s \t%s \t%s" % (msgerror, msgexpl, msgaction))
                break


parse(fn)