Main Page | Class List | Directories | File List | Class Members | File Members

sgmlop.c

Go to the documentation of this file.
00001 /*
00002  * SGMLOP
00003  * $Id: sgmlop.c,v 1.14 2002/07/31 06:04:31 loewis Exp $
00004  *
00005  * The sgmlop accelerator module
00006  *
00007  * This module provides a FastSGMLParser type, which is designed to
00008  * speed up the standard sgmllib and xmllib modules.  The parser can
00009  * be configured to support either basic SGML (enough of it to process
00010  * HTML documents, at least) or XML.  This module also provides an
00011  * Element type, useful for fast but simple DOM implementations.
00012  *
00013  * History:
00014  * 1998-04-04 fl  Created (for coreXML)
00015  * 1998-04-05 fl  Added close method
00016  * 1998-04-06 fl  Added parse method, revised callback interface
00017  * 1998-04-14 fl  Fixed parsing of PI tags
00018  * 1998-05-14 fl  Cleaned up for first public release
00019  * 1998-05-19 fl  Fixed xmllib compatibility: handle_proc, handle_special
00020  * 1998-05-22 fl  Added attribute parser
00021  * 1999-06-20 fl  Added Element data type, various bug fixes.
00022  * 2000-05-28 fl  Fixed data truncation error (@SGMLOP1)
00023  * 2000-05-28 fl  Added temporary workaround for unicode problem (@SGMLOP2)
00024  * 2000-05-28 fl  Removed optional close argument (@SGMLOP3)
00025  * 2000-05-28 fl  Raise exception on recursive feed (@SGMLOP4)
00026  * 2000-07-05 fl  Fixed attribute handling in empty tags (@SGMLOP6)
00027  * 2001-12-28 wd  Add XMLUnicodeParser
00028  * 2001-12-31 mvl Properly process large character references
00029  *
00030  * Copyright (c) 1998-2000 by Secret Labs AB
00031  * Copyright (c) 1998-2000 by Fredrik Lundh
00032  * 
00033  * fredrik@pythonware.com
00034  * http://www.pythonware.com
00035  *
00036  * By obtaining, using, and/or copying this software and/or its
00037  * associated documentation, you agree that you have read, understood,
00038  * and will comply with the following terms and conditions:
00039  * 
00040  * Permission to use, copy, modify, and distribute this software and its
00041  * associated documentation for any purpose and without fee is hereby
00042  * granted, provided that the above copyright notice appears in all
00043  * copies, and that both that copyright notice and this permission notice
00044  * appear in supporting documentation, and that the name of Secret Labs
00045  * AB or the author not be used in advertising or publicity pertaining to
00046  * distribution of the software without specific, written prior
00047  * permission.
00048  * 
00049  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
00050  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
00051  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
00052  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
00053  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
00054  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
00055  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  */
00056 
#include "Python.h"

#include <ctype.h>
#include <limits.h>
00060 
/* Unicode could not be disabled in Python 1.6, 2.0 and 2.1, so for
   those versions define the feature macro and a PyUnicode_GetMax()
   fallback here. */
#if (PY_MAJOR_VERSION == 1 && PY_MINOR_VERSION > 5) || \
    (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION < 2)
#define Py_USING_UNICODE
#define PyUnicode_GetMax()  (0xffff)
#endif
00066 
/* Character type and classification helpers used by the parser.
   Every helper is invoked in call form, e.g. ISSPACE(ch). */
#ifdef SGMLOP_UNICODE_SUPPORT
/* wide character set (experimental) */
/* FIXME: under Python 1.6, the current version converts Unicode
   strings to UTF-8, and parses the result as if it was an ASCII
   string. */
#define CHAR_T  Py_UNICODE
#define ISALNUM Py_UNICODE_ISALNUM
#define ISSPACE Py_UNICODE_ISSPACE
#define TOLOWER Py_UNICODE_TOLOWER
#else
/* 8-bit character set */
/* The <ctype.h> functions require an argument that is representable
   as unsigned char (or EOF).  Plain char may be signed, so bytes
   >= 0x80 are converted to unsigned char first; passing them as
   negative ints would be undefined behaviour.  TOLOWER converts its
   result back to CHAR_T so that it still compares equal to other
   CHAR_T values. */
#define CHAR_T  char
#define ISALNUM(c) isalnum((unsigned char) (c))
#define ISSPACE(c) isspace((unsigned char) (c))
#define TOLOWER(c) ((CHAR_T) tolower((unsigned char) (c)))
#endif
00083 
/* Allocation tracing hooks.  With the condition below changed to 1,
   ALLOC and RELEASE keep a running total in `memory` and print it
   together with the comment; as shipped both expand to nothing. */
#if 0
static int memory = 0;
#define ALLOC(size, comment)\
do { memory += size; printf("%8d - %s\n", memory, comment); } while (0)
#define RELEASE(size, comment)\
do { memory -= size; printf("%8d - %s\n", memory, comment); } while (0)
#else
#define ALLOC(size, comment)        /* tracing disabled */
#define RELEASE(size, comment)      /* tracing disabled */
#endif
00094 
00095 /* ==================================================================== */
00096 /* parser data type */
00097 
/* state flags; fastfeed stores MAYBE in the parser's doctype field
   when it sees a possible <!DOCTYPE tag */
#define MAYBE   1
#define SURE    2
00101 
00102 /* parser type definition */
00103 typedef struct {
00104     PyObject_HEAD
00105 
00106     /* mode flags */
00107     int xml; /* 0=sgml/html 1=xml */
00108     int unicode; /* 0=8bit strings 1=unicode objects */
00109     char *encoding;
00110 
00111     /* state attributes */
00112     int feed;
00113     int shorttag; /* 0=normal 2=parsing shorttag */
00114     int doctype; /* 0=normal 1=dtd pending 2=parsing dtd */
00115 
00116     /* buffer (holds incomplete tags) */
00117     char* buffer;
00118     int bufferlen; /* current amount of data */
00119     int buffertotal; /* actually allocated */
00120 
00121     /* callbacks */
00122     PyObject* finish_starttag;
00123     PyObject* finish_endtag;
00124     PyObject* handle_proc;
00125     PyObject* handle_special;
00126     PyObject* handle_charref;
00127     PyObject* handle_entityref;
00128     PyObject* handle_data;
00129     PyObject* handle_cdata;
00130     PyObject* handle_comment;
00131 
00132 } FastSGMLParserObject;
00133 
00134 staticforward PyTypeObject FastSGMLParser_Type;
00135 
00136 /* forward declarations */
00137 static int fastfeed(FastSGMLParserObject* self);
00138 static PyObject* attrparse(FastSGMLParserObject* self, const CHAR_T* p, int len);
00139 static int fetchEncoding(FastSGMLParserObject* self, const CHAR_T* data, int len);
00140 static PyObject* stringFromData(FastSGMLParserObject* self, const CHAR_T* data, int len);
00141 static int callWithString(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data, int len);
00142 static int callWith2Strings(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data1, int len1, const CHAR_T* data2, int len2);
00143 static int callWithStringAndObj(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data, int len, PyObject* obj);
00144 
00145 #define callHandleData(self, data, len) callWithString((self), (self)->handle_data, (data), (len))
00146 #define callHandleCData(self, data, len) callWithString((self), (self)->handle_cdata, (data), (len))
00147 #define callHandleComment(self, data, len) callWithString((self), (self)->handle_comment, (data), (len))
00148 #define callHandleEntityRef(self, data, len) callWithString((self), (self)->handle_entityref, (data), (len))
00149 #define callHandleCharRef(self, data, len) callWithString((self), (self)->handle_charref, (data), (len))
00150 #define callHandleSpecial(self, data, len) callWithString((self), (self)->handle_special, (data), (len))
00151 #define callHandleProc(self, data1, len1, data2, len2) callWith2Strings((self), (self)->handle_proc, (data1), (len1), (data2), (len2))
00152 #define callFinishStartTag(self, data, len, obj) callWithStringAndObj((self), (self)->finish_starttag, (data), (len), (obj))
00153 #define callFinishEndTag(self, data, len) callWithString((self), (self)->finish_endtag, (data), (len))
00154 
00155 /* -------------------------------------------------------------------- */
00156 /* create parser */
00157 
00158 static PyObject*
00159 _sgmlop_new(int xml, int unicode)
00160 {
00161     FastSGMLParserObject* self;
00162 
00163     self = PyObject_NEW(FastSGMLParserObject, &FastSGMLParser_Type);
00164     if (self == NULL)
00165         return NULL;
00166 
00167     self->xml = xml;
00168     self->unicode = unicode;
00169     self->encoding = NULL;
00170 
00171     self->feed = 0;
00172     self->shorttag = 0;
00173     self->doctype = 0;
00174 
00175     self->buffer = NULL;
00176     self->bufferlen = 0;
00177     self->buffertotal = 0;
00178 
00179     self->finish_starttag = NULL;
00180     self->finish_endtag = NULL;
00181     self->handle_proc = NULL;
00182     self->handle_special = NULL;
00183     self->handle_charref = NULL;
00184     self->handle_entityref = NULL;
00185     self->handle_data = NULL;
00186     self->handle_cdata = NULL;
00187     self->handle_comment = NULL;
00188 
00189     return (PyObject*) self;
00190 }
00191 
00192 static PyObject*
00193 _sgmlop_sgmlparser(PyObject* self, PyObject* args)
00194 {
00195     if (!PyArg_NoArgs(args))
00196         return NULL;
00197 
00198     return _sgmlop_new(0, 0);
00199 }
00200 
00201 static PyObject*
00202 _sgmlop_xmlparser(PyObject* self, PyObject* args)
00203 {
00204     if (!PyArg_NoArgs(args))
00205         return NULL;
00206 
00207     return _sgmlop_new(1, 0);
00208 }
00209 
00210 static PyObject*
00211 _sgmlop_xmlunicodeparser(PyObject* self, PyObject* args)
00212 {
00213     if (!PyArg_NoArgs(args))
00214         return NULL;
00215 
00216     return _sgmlop_new(1, 1);
00217 }
00218 
00219 static void
00220 _sgmlop_dealloc(FastSGMLParserObject* self)
00221 {
00222     if (self->buffer)
00223         free(self->buffer);
00224     if (self->encoding)
00225         free(self->encoding);
00226     Py_XDECREF(self->finish_starttag);
00227     Py_XDECREF(self->finish_endtag);
00228     Py_XDECREF(self->handle_proc);
00229     Py_XDECREF(self->handle_special);
00230     Py_XDECREF(self->handle_charref);
00231     Py_XDECREF(self->handle_entityref);
00232     Py_XDECREF(self->handle_data);
00233     Py_XDECREF(self->handle_cdata);
00234     Py_XDECREF(self->handle_comment);
00235     PyMem_DEL(self);
00236 }
00237 
00238 #define GETCB(member, name)\
00239     Py_XDECREF(self->member);\
00240     self->member = PyObject_GetAttrString(item, name);
00241 
00242 static PyObject*
00243 _sgmlop_register(FastSGMLParserObject* self, PyObject* args)
00244 {
00245     /* register a callback object */
00246     PyObject* item;
00247     if (!PyArg_ParseTuple(args, "O", &item))
00248         return NULL;
00249 
00250     GETCB(finish_starttag, "finish_starttag");
00251     GETCB(finish_endtag, "finish_endtag");
00252     GETCB(handle_proc, "handle_proc");
00253     GETCB(handle_special, "handle_special");
00254     GETCB(handle_charref, "handle_charref");
00255     GETCB(handle_entityref, "handle_entityref");
00256     GETCB(handle_data, "handle_data");
00257     GETCB(handle_cdata, "handle_cdata");
00258     GETCB(handle_comment, "handle_comment");
00259 
00260     PyErr_Clear();
00261 
00262     Py_INCREF(Py_None);
00263     return Py_None;
00264 }
00265 
00266 
00267 /* -------------------------------------------------------------------- */
00268 /* feed data to parser.  the parser processes as much of the data as
00269    possible, and keeps the rest in a local buffer. */
00270 
00271 static PyObject*
00272 feed(FastSGMLParserObject* self, char* string, int stringlen, int last)
00273 {
00274     /* common subroutine for SGMLParser.feed and SGMLParser.close */
00275 
00276     int length;
00277 
00278     if (self->feed) {
00279         /* dealing with recursive feeds isn's exactly trivial, so
00280            let's just bail out before the parser messes things up */
00281         PyErr_SetString(PyExc_AssertionError, "recursive feed");
00282         return NULL;
00283     }
00284 
00285     /* append new text block to local buffer */
00286     if (!self->buffer) {
00287         length = stringlen;
00288         self->buffer = malloc(length);
00289         self->buffertotal = stringlen;
00290     } else {
00291         length = self->bufferlen + stringlen;
00292         if (length > self->buffertotal) {
00293             self->buffer = realloc(self->buffer, length);
00294             self->buffertotal = length;
00295         }
00296     }
00297     if (!self->buffer) {
00298         PyErr_NoMemory();
00299         return NULL;
00300     }
00301     memcpy(self->buffer + self->bufferlen, string, stringlen);
00302     self->bufferlen = length;
00303 
00304     self->feed = 1;
00305 
00306     length = fastfeed(self);
00307 
00308     self->feed = 0;
00309 
00310     if (length < 0)
00311         return NULL;
00312 
00313     if (length > self->bufferlen) {
00314         /* ran beyond the end of the buffer (internal error)*/
00315         PyErr_SetString(PyExc_AssertionError, "buffer overrun");
00316         return NULL;
00317     }
00318 
00319     if (length > 0 && length < self->bufferlen)
00320         /* adjust buffer */
00321         memmove(self->buffer, self->buffer + length,
00322                 self->bufferlen - length);
00323 
00324     self->bufferlen = self->bufferlen - length;
00325 
00326     /* FIXME: if data remains in the buffer even through this is the
00327        last call, do an extra handle_data to get rid of it */
00328 
00329     /* FIXME: if this is the last call, shut the parser down and
00330        release the internal buffers */
00331 
00332     return Py_BuildValue("i", self->bufferlen);
00333 }
00334 
00335 static PyObject*
00336 _sgmlop_feed(FastSGMLParserObject* self, PyObject* args)
00337 {
00338     /* feed a chunk of data to the parser */
00339 
00340     char* string;
00341     int stringlen;
00342     if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
00343         return NULL;
00344 
00345     return feed(self, string, stringlen, 0);
00346 }
00347 
00348 static PyObject*
00349 _sgmlop_close(FastSGMLParserObject* self, PyObject* args)
00350 {
00351     /* flush parser buffers */
00352 
00353     if (!PyArg_NoArgs(args))
00354         return NULL;
00355 
00356     return feed(self, "", 0, 1);
00357 }
00358 
00359 static PyObject*
00360 _sgmlop_parse(FastSGMLParserObject* self, PyObject* args)
00361 {
00362     /* feed a single chunk of data to the parser */
00363 
00364     char* string;
00365     int stringlen;
00366     if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
00367         return NULL;
00368 
00369     return feed(self, string, stringlen, 1);
00370 }
00371 
00372 
00373 /* -------------------------------------------------------------------- */
00374 /* type interface */
00375 
00376 static PyMethodDef _sgmlop_methods[] = {
00377     /* register callbacks */
00378     {"register", (PyCFunction) _sgmlop_register, 1},
00379     /* incremental parsing */
00380     {"feed", (PyCFunction) _sgmlop_feed, 1},
00381     {"close", (PyCFunction) _sgmlop_close, 0},
00382     /* one-shot parsing */
00383     {"parse", (PyCFunction) _sgmlop_parse, 1},
00384     {NULL, NULL}
00385 };
00386 
00387 static PyObject*  
00388 _sgmlop_getattr(FastSGMLParserObject* self, char* name)
00389 {
00390     return Py_FindMethod(_sgmlop_methods, (PyObject*) self, name);
00391 }
00392 
00393 statichere PyTypeObject FastSGMLParser_Type = {
00394     PyObject_HEAD_INIT(NULL)
00395     0, /* ob_size */
00396     "FastSGMLParser", /* tp_name */
00397     sizeof(FastSGMLParserObject), /* tp_size */
00398     0, /* tp_itemsize */
00399     /* methods */
00400     (destructor)_sgmlop_dealloc, /* tp_dealloc */
00401     0, /* tp_print */
00402     (getattrfunc)_sgmlop_getattr, /* tp_getattr */
00403     0 /* tp_setattr */
00404 };
00405 
00406 /* ==================================================================== */
00407 /* element data type */
00408 
00409 typedef struct {
00410     PyObject_HEAD
00411 
00412     /* an element has the following attributes: */
00413     PyObject* parent; /* back link (None for the root node) */
00414     PyObject* tag; /* element tag (a string) */
00415     PyObject* attrib; /* attributes (a dictionary object) */
00416     PyObject* text; /* text before first child */
00417     PyObject* suffix; /* text after this element, in parent */
00418 
00419     /* in addition, it can hold any number of child nodes: */
00420     int child_count; /* actual items */
00421     int child_total; /* allocated items */
00422     PyObject* *children;
00423 
00424     /* Note: the suffix attribute holds textual data that belongs to
00425        the parent.  on other words, each element represents the
00426        following XML snippet:
00427 
00428            "<tag attributes> text children </name> suffix"
00429 
00430        */
00431 
00432 } ElementObject;
00433 
00434 staticforward PyTypeObject Element_Type;
00435 
00436 /* -------------------------------------------------------------------- */
00437 /* element constructor and destructor */
00438 
00439 static PyObject*
00440 element_new(PyObject* _self, PyObject* args)
00441 {
00442     ElementObject* self;
00443 
00444     PyObject* parent;
00445     PyObject* tag;
00446     PyObject* attrib = Py_None;
00447     PyObject* text = Py_None;
00448     PyObject* suffix = Py_None;
00449     if (!PyArg_ParseTuple(args, "OO|OOO", &parent, &tag,
00450                           &attrib, &text, &suffix))
00451         return NULL;
00452 
00453     if (parent != Py_None && parent->ob_type != &Element_Type) {
00454         PyErr_SetString(PyExc_TypeError, "parent must be Element or None");
00455         return NULL;
00456     }
00457 
00458     self = PyObject_NEW(ElementObject, &Element_Type);
00459     if (self == NULL)
00460         return NULL;
00461 
00462     Py_INCREF(parent);
00463     self->parent = parent;
00464 
00465     Py_INCREF(tag);
00466     self->tag = tag;
00467 
00468     Py_INCREF(attrib);
00469     self->attrib = attrib;
00470 
00471     Py_INCREF(text);
00472     self->text = text;
00473 
00474     Py_INCREF(suffix);
00475     self->suffix = suffix;
00476 
00477     self->child_count = 0;
00478     self->child_total = 0;
00479     self->children = NULL;
00480 
00481     ALLOC(sizeof(ElementObject), "create element");
00482 
00483     return (PyObject*) self;
00484 }
00485 
00486 static void
00487 element_dealloc(ElementObject* self)
00488 {
00489     int i;
00490 
00491     /* FIXME: the parent attribute means that a tree will contain
00492        circular references.  this will be fixed ("how?" is the big
00493        question...) */
00494 
00495     if (self->children) {
00496         for (i = 0; i < self->child_count; i++)
00497             Py_DECREF(self->children[i]);
00498         free(self->children);
00499     }
00500 
00501     /* break the backlink */
00502     Py_DECREF(self->parent);
00503 
00504     /* discard attributes */
00505     Py_DECREF(self->tag);
00506     Py_XDECREF(self->attrib);
00507     Py_XDECREF(self->text);
00508     Py_XDECREF(self->suffix);
00509 
00510     RELEASE(sizeof(ElementObject), "destroy element");
00511 
00512     PyMem_DEL(self);
00513 }
00514 
00515 /* -------------------------------------------------------------------- */
00516 /* methods (in alphabetical order) */
00517 
00518 static PyObject*
00519 element_append(ElementObject* self, PyObject* args)
00520 {
00521     int total;
00522     
00523     PyObject* element;
00524     if (!PyArg_ParseTuple(args, "O!", &Element_Type, &element))
00525         return NULL;
00526 
00527     if (!self->children) {
00528         total = 10;
00529         self->children = malloc(total * sizeof(PyObject*));
00530         self->child_total = total;
00531     } else if (self->child_count >= self->child_total) {
00532         total = self->child_total + 10;
00533         self->children = realloc(self->children, total * sizeof(PyObject*));
00534         self->child_total = total;
00535     }
00536     if (!self->children) {
00537         PyErr_NoMemory();
00538         return NULL;
00539     }
00540 
00541     Py_INCREF(element);
00542     self->children[self->child_count++] = element;
00543 
00544     Py_INCREF(Py_None);
00545     return Py_None;
00546 }
00547 
00548 static PyObject*
00549 element_destroy(ElementObject* self, PyObject* args)
00550 {
00551     int i;
00552     PyObject* res;
00553     
00554     if (!PyArg_NoArgs(args))
00555         return NULL;
00556 
00557     /* break the backlink */
00558     if (self->parent != Py_None) {
00559         Py_DECREF(self->parent);
00560         self->parent = Py_None;
00561         Py_INCREF(self->parent);
00562     }
00563 
00564     /* destroy element children */
00565     if (self->children) {
00566         for (i = 0; i < self->child_count; i++) {
00567             res = element_destroy((ElementObject*) self->children[i], args);
00568             Py_DECREF(res);
00569             Py_DECREF(self->children[i]);
00570         }
00571         self->child_count = 0;
00572     }
00573 
00574     /* leave the rest to the garbage collector... */
00575 
00576     Py_INCREF(Py_None);
00577     return Py_None;
00578 }
00579 
00580 static PyObject *
00581 element_get(ElementObject* self, PyObject* args)
00582 {
00583     PyObject* value;
00584 
00585     PyObject* key;
00586     PyObject* default_value = Py_None;
00587     if (!PyArg_ParseTuple(args, "O|O", &key, &default_value))
00588         return NULL;
00589 
00590     value = PyDict_GetItem(self->attrib, key);
00591     if (!value) {
00592         value = default_value;
00593         PyErr_Clear();
00594     }
00595 
00596     Py_INCREF(value);
00597     return value;
00598 }
00599 
00600 static PyObject*
00601 element_getitem(ElementObject* self, int index)
00602 {
00603     if (index < 0 || index >= self->child_count) {
00604         PyErr_SetString(PyExc_IndexError, "child index out of range");
00605         return NULL;
00606     }
00607 
00608     Py_INCREF(self->children[index]);
00609     return self->children[index];
00610 }
00611 
00612 static int
00613 element_length(ElementObject* self)
00614 {
00615     return self->child_count;
00616 }
00617 
00618 static PyObject*
00619 element_repr(ElementObject* self)
00620 {
00621     char buf[300];
00622     if (PyString_Check(self->tag))
00623         sprintf(
00624             buf, "<Element object '%.256s' at %lx>",
00625             PyString_AsString(self->tag),
00626             (long) self
00627             );
00628     else
00629         sprintf(
00630             buf, "<Element object at %lx>",
00631             (long) self
00632             );
00633 
00634     return PyString_FromString(buf);
00635 }
00636 
00637 /* -------------------------------------------------------------------- */
00638 /* type descriptor */
00639 
00640 static PyMethodDef element_methods[] = {
00641     {"get", (PyCFunction) element_get, 1},
00642     {"append", (PyCFunction) element_append, 1},
00643     {"destroy", (PyCFunction) element_destroy, 0},
00644     {NULL, NULL}
00645 };
00646 
00647 static PyObject*  
00648 element_getattr(ElementObject* self, char* name)
00649 {
00650     PyObject* res;
00651 
00652     res = Py_FindMethod(element_methods, (PyObject*) self, name);
00653     if (res)
00654         return res;
00655 
00656     PyErr_Clear();
00657 
00658     if (strcmp(name, "tag") == 0)
00659         res = self->tag;
00660     else if (strcmp(name, "text") == 0)
00661         res = self->text;
00662     else if (strcmp(name, "suffix") == 0)
00663         res = self->suffix;
00664     else if (strcmp(name, "attrib") == 0)
00665         res = self->attrib;
00666     else if (strcmp(name, "parent") == 0)
00667         res = self->parent;
00668     else {
00669         PyErr_SetString(PyExc_AttributeError, name);
00670         return NULL;
00671     }
00672 
00673     Py_INCREF(res);
00674     return res;
00675 }
00676 
00677 static int
00678 element_setattr(ElementObject *self, const char* name, PyObject* value)
00679 {
00680     if (value == NULL) {
00681         PyErr_SetString(PyExc_AttributeError,
00682                         "can't delete element attributes");
00683         return -1;
00684     }
00685 
00686     if (strcmp(name, "text") == 0) {
00687 
00688         Py_DECREF(self->text);
00689         self->text = value;
00690         Py_INCREF(self->text);
00691 
00692     } else if (strcmp(name, "suffix") == 0) {
00693 
00694         Py_DECREF(self->suffix);
00695         self->suffix = value;
00696         Py_INCREF(self->suffix);
00697 
00698     } else if (strcmp(name, "attrib") == 0) {
00699 
00700         Py_DECREF(self->attrib);
00701         self->attrib = value;
00702         Py_INCREF(self->attrib);
00703 
00704     } else {
00705 
00706         PyErr_SetString(PyExc_AttributeError, name);
00707         return -1;
00708 
00709     }
00710 
00711     return 0;
00712 }
00713 
00714 static PySequenceMethods element_as_sequence = {
00715     (inquiry) element_length, /* sq_length */
00716     0, /* sq_concat */
00717     0, /* sq_repeat */
00718     (intargfunc) element_getitem, /* sq_item */
00719     0, /* sq_slice */
00720     0, /* sq_ass_item */
00721     0, /* sq_ass_slice */
00722 };
00723 
00724 statichere PyTypeObject Element_Type = {
00725     PyObject_HEAD_INIT(NULL)
00726     0, /* ob_size */
00727     "Element", /* tp_name */
00728     sizeof(ElementObject), /*tp_size*/
00729     0, /* tp_itemsize */
00730     /* methods */
00731     (destructor)element_dealloc, /* tp_dealloc */
00732     0, /* tp_print */
00733     (getattrfunc)element_getattr, /* tp_getattr */
00734     (setattrfunc)element_setattr, /* tp_setattr */
00735     0, /* tp_compare */
00736     (reprfunc)element_repr, /* tp_repr */
00737     0, /* tp_as_number */
00738     &element_as_sequence, /* tp_as_sequence */
00739     0 /* tp_as_mapping */
00740 };
00741 
00742 
00743 /* ==================================================================== */
00744 /* tree builder (not yet implemented) */
00745 
00746 typedef struct {
00747     PyObject_HEAD
00748 
00749     PyObject* root; /* root node (first created node) */
00750 
00751     PyObject* this; /* current node */
00752     PyObject* last; /* most recently created node */
00753     PyObject* data; /* data collector */
00754 
00755 } TreeBuilderObject;
00756 
00757 staticforward PyTypeObject TreeBuilder_Type;
00758 
00759 /* -------------------------------------------------------------------- */
00760 /* constructor and destructor */
00761 
00762 static PyObject*
00763 treebuilder_new(PyObject* _self, PyObject* args)
00764 {
00765     TreeBuilderObject* self;
00766 
00767     /* no arguments */
00768     if (!PyArg_NoArgs(args))
00769         return NULL;
00770 
00771     self = PyObject_NEW(TreeBuilderObject, &TreeBuilder_Type);
00772     if (self == NULL)
00773         return NULL;
00774 
00775     Py_INCREF(Py_None);
00776     self->root = Py_None;
00777 
00778     self->this = NULL;
00779     self->last = NULL;
00780     self->data = NULL;
00781 
00782     return (PyObject*) self;
00783 }
00784 
00785 static void
00786 treebuilder_dealloc(TreeBuilderObject* self)
00787 {
00788     Py_XDECREF(self->data);
00789     Py_XDECREF(self->last);
00790     Py_XDECREF(self->this);
00791     Py_DECREF(self->root);
00792     PyMem_DEL(self);
00793 }
00794 
00795 /* -------------------------------------------------------------------- */
00796 /* methods (in alphabetical order) */
00797 
00798 static PyObject*
00799 treebuilder_start(TreeBuilderObject* self, PyObject* args)
00800 {
00801     PyObject* tag;
00802     PyObject* attrib = Py_None;
00803     if (!PyArg_ParseTuple(args, "O|O", &tag, &attrib))
00804         return NULL;
00805 
00806     /* create a new node */
00807 
00808     Py_INCREF(Py_None);
00809     return Py_None;
00810 }
00811 
00812 static PyObject*
00813 treebuilder_end(TreeBuilderObject* self, PyObject* args)
00814 {
00815     PyObject* tag;
00816     if (!PyArg_ParseTuple(args, "O", &tag))
00817         return NULL;
00818 
00819     /* end current node */
00820 
00821     Py_INCREF(Py_None);
00822     return Py_None;
00823 }
00824 
00825 static PyObject *
00826 treebuilder_data(TreeBuilderObject* self, PyObject* args)
00827 {
00828     PyObject* data;
00829     if (!PyArg_ParseTuple(args, "O", &data))
00830         return NULL;
00831 
00832     /* add data to collector */
00833 
00834     Py_INCREF(Py_None);
00835     return Py_None;
00836 }
00837 
00838 /* -------------------------------------------------------------------- */
00839 /* type descriptor */
00840 
00841 static PyMethodDef treebuilder_methods[] = {
00842     {"data", (PyCFunction) treebuilder_data, 1},
00843     {"start", (PyCFunction) treebuilder_start, 1},
00844     {"end", (PyCFunction) treebuilder_end, 1},
00845     {NULL, NULL}
00846 };
00847 
00848 static PyObject*  
00849 treebuilder_getattr(ElementObject* self, char* name)
00850 {
00851     return Py_FindMethod(treebuilder_methods, (PyObject*) self, name);
00852 }
00853 
00854 statichere PyTypeObject TreeBuilder_Type = {
00855     PyObject_HEAD_INIT(NULL)
00856     0, /* ob_size */
00857     "TreeBuilder", /* tp_name */
00858     sizeof(TreeBuilderObject), /*tp_size*/
00859     0, /* tp_itemsize */
00860     /* methods */
00861     (destructor)treebuilder_dealloc, /* tp_dealloc */
00862     0, /* tp_print */
00863     (getattrfunc)treebuilder_getattr, /* tp_getattr */
00864     0, /* tp_setattr */
00865     0, /* tp_compare */
00866     0, /* tp_repr */
00867     0, /* tp_as_number */
00868     0, /* tp_as_sequence */
00869     0 /* tp_as_mapping */
00870 };
00871 
00872 
00873 /* ==================================================================== */
00874 /* python module interface */
00875 
00876 static PyMethodDef _functions[] = {
00877     {"SGMLParser", _sgmlop_sgmlparser, 0},
00878     {"XMLParser", _sgmlop_xmlparser, 0},
00879     {"XMLUnicodeParser", _sgmlop_xmlunicodeparser, 0},
00880     {"Element", element_new, 1},
00881     {"TreeBuilder", treebuilder_new, 0},
00882     {NULL, NULL}
00883 };
00884 
00885 DL_EXPORT(void)
00886 initsgmlop(void)
00887 {
00888     /* Patch object type */
00889     FastSGMLParser_Type.ob_type =
00890     Element_Type.ob_type =
00891     TreeBuilder_Type.ob_type = &PyType_Type;
00892 
00893     Py_InitModule("sgmlop", _functions);
00894 }
00895 
00896 /* -------------------------------------------------------------------- */
00897 /* the parser does it all in a single loop, keeping the necessary
00898    state in a few flag variables and the data buffer.  if you have
00899    a good optimizer, this can be incredibly fast. */
00900 
/* token codes assigned by fastfeed while scanning */

/* tag-like tokens */
#define TAG         0x100
#define TAG_START   0x101
#define TAG_END     0x102
#define TAG_EMPTY   0x103
#define DIRECTIVE   0x104
#define DOCTYPE     0x105
#define PI          0x106
#define DTD_START   0x107
#define DTD_END     0x108
#define DTD_ENTITY  0x109

/* character data section */
#define CDATA       0x200

/* references */
#define ENTITYREF   0x400
#define CHARREF     0x401

/* comments */
#define COMMENT     0x800
00915 
00916 static int
00917 fastfeed(FastSGMLParserObject* self)
00918 {
00919     CHAR_T *end; /* tail */
00920     CHAR_T *p, *q, *s; /* scanning pointers */
00921     CHAR_T *b, *t, *e; /* token start/end */
00922 
00923     int token;
00924 
00925     s = q = p = (CHAR_T*) self->buffer;
00926     end = (CHAR_T*) (self->buffer + self->bufferlen);
00927 
00928     while (p < end) {
00929 
00930         q = p; /* start of token */
00931 
00932         if (*p == '<') {
00933             int has_attr;
00934 
00935             /* <tags> */
00936             token = TAG_START;
00937             if (++p >= end)
00938                 goto eol;
00939 
00940             if (*p == '!') {
00941                 /* <! directive */
00942                 if (++p >= end)
00943                     goto eol;
00944                 token = DIRECTIVE;
00945                 b = t = p;
00946                 if (*p == '-') {
00947                     /* <!-- comment --> */
00948                     token = COMMENT;
00949                     b = p + 2;
00950                     for (;;) {
00951                         if (p+3 >= end)
00952                             goto eol;
00953                         if (p[1] != '-')
00954                             p += 2; /* boyer moore, sort of ;-) */
00955                         else if (p[0] != '-' || p[2] != '>')
00956                             p++;
00957                         else
00958                             break;
00959                     }
00960                     e = p;
00961                     p += 3;
00962                     goto eot;
00963                 } else if (self->xml) {
00964                     /* FIXME: recognize <!ATTLIST data> ? */
00965                     /* FIXME: recognize <!ELEMENT data> ? */
00966                     /* FIXME: recognize <!ENTITY data> ? */
00967                     /* FIXME: recognize <!NOTATION data> ? */
00968                     if (*p == 'D' ) {
00969                         /* FIXME: make sure this really is a !DOCTYPE tag */
00970                         /* <!DOCTYPE data> or <!DOCTYPE data [ data ]> */
00971                         token = DOCTYPE;
00972                         self->doctype = MAYBE;
00973                     } else if (*p == '[') {
00974                         /* FIXME: make sure this really is a ![CDATA[ tag */
00975                         /* FIXME: recognize <![INCLUDE */
00976                         /* FIXME: recognize <![IGNORE */
00977                         /* <![CDATA[data]]> */
00978                         token = CDATA;
00979                         b = t = p + 7;
00980                         for (;;) {
00981                             if (p+3 >= end)
00982                                 goto eol;
00983                             if (p[1] != ']')
00984                                 p += 2;
00985                             else if (p[0] != ']' || p[2] != '>')
00986                                 p++;
00987                             else
00988                                 break;
00989                         }
00990                         e = p;
00991                         p += 3;
00992                         goto eot;
00993                     }
00994                 }
00995             } else if (*p == '?') {
00996                 token = PI;
00997                 if (++p >= end)
00998                     goto eol;
00999             } else if (*p == '/') {
01000                 /* </endtag> */
01001                 token = TAG_END;
01002                 if (++p >= end)
01003                     goto eol;
01004             }
01005 
01006             /* process tag name */
01007             b = p;
01008             if (!self->xml)
01009                 while (ISALNUM(*p) || *p == '-' || *p == '.' ||
01010                        *p == ':' || *p == '?') {
01011                     *p = (CHAR_T) TOLOWER(*p);
01012                     if (++p >= end)
01013                         goto eol;
01014                 }
01015             else
01016                 while (ISALNUM(*p) || *p == '-' || *p == '.' || *p == '_' ||
01017                        *p == ':' || *p == '?') {
01018                     if (++p >= end)
01019                         goto eol;
01020                 }
01021 
01022             t = p;
01023 
01024             has_attr = 0;
01025 
01026             if (*p == '/' && !self->xml) {
01027                 /* <tag/data/ or <tag/> */
01028                 token = TAG_START;
01029                 e = p;
01030                 if (++p >= end)
01031                     goto eol;
01032                 if (*p == '>') {
01033                     /* <tag/> */
01034                     token = TAG_EMPTY;
01035                     if (++p >= end)
01036                         goto eol;
01037                 } else
01038                     /* <tag/data/ */
01039                     self->shorttag = SURE;
01040                     /* we'll generate an end tag when we stumble upon
01041                        the end slash */
01042 
01043             } else {
01044 
01045                 /* skip attributes */
01046                 int quote = 0;
01047                 int last = 0;
01048                 if (token==PI && self->xml) {
01049                     int found = 0;
01050                     while ((*p!='>') || (!found)) {
01051                         found = (*p=='?');
01052                         if (++p >= end)
01053                             goto eol;
01054                     }
01055                     last = '?';
01056                 }
01057                 else {
01058                     while (*p != '>' || quote) {
01059                         if (!ISSPACE(*p)) {
01060                             has_attr = 1;
01061                             /* FIXME: note: end tags cannot have attributes! */
01062                         }
01063                         if (quote) {
01064                             if (*p == quote)
01065                                 quote = 0;
01066                         } else {
01067                             if (*p == '"' || *p == '\'')
01068                                 quote = *p;
01069                         }
01070                         if (*p == '[' && !quote && self->doctype) {
01071                             self->doctype = SURE;
01072                             token = DTD_START;
01073                             e = p++;
01074                             goto eot;
01075                         }
01076                         last = *p;
01077                         if (++p >= end)
01078                             goto eol;
01079                     }
01080                 }
01081 
01082                 e = p++;
01083 
01084                 if (last == '/') {
01085                     /* <tag/> */
01086                     e--;
01087                     token = TAG_EMPTY;
01088                 } else if (token == PI && last == '?')
01089                     e--;
01090 
01091                 if (self->doctype == MAYBE)
01092                     self->doctype = 0; /* there was no dtd */
01093 
01094                 if (has_attr)
01095                     ; /* FIXME: process attributes */
01096 
01097             }
01098 
01099         } else if (*p == '/' && self->shorttag) {
01100 
01101             /* end of shorttag. this generates an empty end tag */
01102             token = TAG_END;
01103             self->shorttag = 0;
01104             b = t = e = p;
01105             if (++p >= end)
01106                 goto eol;
01107 
01108         } else if (*p == ']' && self->doctype) {
01109 
01110             /* end of dtd. this generates an empty end tag */
01111             token = DTD_END;
01112             /* FIXME: who handles the ending > !? */
01113             b = t = e = p;
01114             if (++p >= end)
01115                 goto eol;
01116             self->doctype = 0;
01117 
01118         } else if (*p == '%' && self->doctype) {
01119 
01120             /* doctype entities */
01121             token = DTD_ENTITY;
01122             if (++p >= end)
01123                 goto eol;
01124             b = t = p;
01125             while (ISALNUM(*p) || *p == '.')
01126                 if (++p >= end)
01127                     goto eol;
01128             e = p;
01129             if (*p == ';')
01130                 p++;
01131 
01132         } else if (*p == '&') {
01133 
01134             /* entities */
01135             token = ENTITYREF;
01136             if (++p >= end)
01137                 goto eol;
01138             if (*p == '#') {
01139                 token = CHARREF;
01140                 if (++p >= end)
01141                     goto eol;
01142             }
01143             b = t = p;
01144             if (self->xml) {
01145                 while (ISALNUM(*p) || *p == '.' || *p == '-' || *p == '_' || *p == ':')
01146                     if (++p >= end)
01147                         goto eol;
01148             } else {
01149                 while (ISALNUM(*p) || *p == '.')
01150                     if (++p >= end)
01151                         goto eol;
01152             }
01153             e = p;
01154             if (*p == ';')
01155                 p++;
01156             else
01157                 continue;
01158   
01159         } else {
01160 
01161             /* raw data */
01162             if (++p >= end) {
01163                 q = p;
01164                 goto eol;
01165             }
01166             continue;
01167 
01168         }
01169 
01170       eot: /* end of token */
01171 
01172         if (q != s && self->handle_data) {
01173             /* flush any raw data before this tag */
01174             if (callHandleData(self, s, q-s))
01175                 return -1;
01176         }
01177 
01178         /* invoke callbacks */
01179         if (token & TAG) {
01180             if (token == TAG_END) {
01181                 if (self->finish_endtag) {
01182                     if (callFinishEndTag(self, b, t-b))
01183                         return -1;
01184                 }
01185             } else if (token == DIRECTIVE || token == DOCTYPE) {
01186                 if (self->handle_special) {
01187                     if (callHandleSpecial(self, b, e-b))
01188                         return -1;
01189                 }
01190             } else if (token == PI) {
01191                 if (self->handle_proc) {
01192                     int len = t-b;
01193                     while (ISSPACE(*t))
01194                         t++;
01195                     if ((len==3) && (b[0]=='x') && (b[1]=='m') && (b[2]=='l'))
01196                         fetchEncoding(self, t, e-t);
01197 
01198                     if (callHandleProc(self, b, len, t, e-t))
01199                         return -1;
01200                 }
01201             } else if (self->finish_starttag) {
01202                 PyObject* attr;
01203                 int len = t-b;
01204                 while (ISSPACE(*t))
01205                     t++;
01206                 attr = attrparse(self, t, e-t);
01207                 if (!attr)
01208                     return -1;
01209                 if (callFinishStartTag(self, b, len, attr))
01210                 {
01211                     Py_DECREF(attr);
01212                     return -1;
01213                 }
01214                 Py_DECREF(attr);
01215                 if (token == TAG_EMPTY && self->finish_endtag) {
01216                     if (callFinishEndTag(self, b, len))
01217                        return -1;
01218                 }
01219             }
01220         } else if (token == ENTITYREF && self->handle_entityref) {
01221             if (callHandleEntityRef(self, b, e-b))
01222                 return -1;
01223         } else if (token == CHARREF && (self->handle_charref ||
01224                                         self->handle_data)) {
01225             if (self->handle_charref)
01226             {
01227                 if (callHandleCharRef(self, b, e-b))
01228                     return -1;
01229             }
01230             else {
01231                 /* fallback: handle charref's as data */
01232                 int ch = 0;
01233                 CHAR_T *p;
01234                 if (*b == 'x') {
01235                     for (p = b+1; p < e; p++)
01236                         ch = ch*16 + *p - (*p > 'F' ? 
01237                                            'a'-10 :(*p > '9' ? 
01238                                                    'A'-10 : '0'));
01239                 } else {
01240                     for (p = b; p < e; p++)
01241                         ch = ch*10 + *p - '0';
01242                 }
01243 #ifdef Py_USING_UNICODE
01244                 if (self->unicode) {
01245                     PyObject *res;
01246                     Py_UNICODE uch = ch;
01247                          int maxunicode = PyUnicode_GetMax();
01248 
01249                     if (ch > maxunicode) {
01250                         PyErr_Format(PyExc_ValueError,
01251                             "character reference &#x%x; exceeds sys.maxunicode (0x%x)", ch, maxunicode);
01252                         return -1;
01253                     }
01254                     res = PyObject_CallFunction(self->handle_data,
01255                                                 "u#", &uch, 1);
01256                     if (!res)
01257                         return -1;
01258                     Py_DECREF(res);
01259                 } else
01260 #endif
01261                 {
01262                     char nch;
01263                     if (ch >= 128) {
01264                         /* XXX: should utf-8 encode here for XML; can't do anything for SGML. */
01265                      PyErr_Format(PyExc_ValueError, 
01266                                      "character reference &#x%x; exceeds ASCII range", ch);
01267                      return -1;
01268                     }
01269                     nch = ch;
01270                     if (callHandleData(self, &nch, 1))
01271                         return -1;
01272                 }
01273             }
01274         } else if (token == CDATA && (self->handle_cdata ||
01275                                       self->handle_data)) {
01276             if (self->handle_cdata) {
01277                 if (callHandleCData(self, b, e-b))
01278                     return -1;
01279             } else {
01280                 /* fallback: handle cdata as plain data */
01281                 if (callHandleData(self, b, e-b))
01282                     return -1;
01283             }
01284         } else if (token == COMMENT && self->handle_comment) {
01285             if (callHandleComment(self, b, e-b))
01286                 return -1;
01287         }
01288         
01289         q = p; /* start of token */
01290         s = p; /* start of span */
01291     }
01292 
01293   eol: /* end of line */
01294     if (q != s && self->handle_data) {
01295         if (callHandleData(self, s, q-s))
01296             r