00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057 #include "Python.h"
00058
00059 #include <ctype.h>
00060
/* For Python 1.x newer than 1.5 and for Python 2.0/2.1 this module supplies
   Py_USING_UNICODE and a fixed PyUnicode_GetMax() itself (presumably because
   those headers do not provide them -- confirm against the target headers). */
#if (PY_MAJOR_VERSION == 1 && PY_MINOR_VERSION > 5) || (PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION < 2)
#  define Py_USING_UNICODE
#  define PyUnicode_GetMax() (0xffff)
#endif

/* Character type and classification helpers used by the scanner.
   Building with SGMLOP_UNICODE_SUPPORT selects Py_UNICODE and the
   Py_UNICODE_* helpers; otherwise plain char and <ctype.h> are used. */
#if defined(SGMLOP_UNICODE_SUPPORT)
#  define CHAR_T  Py_UNICODE
#  define ISALNUM Py_UNICODE_ISALNUM
#  define ISSPACE Py_UNICODE_ISSPACE
#  define TOLOWER Py_UNICODE_TOLOWER
#else
#  define CHAR_T  char
#  define ISALNUM isalnum
#  define ISSPACE isspace
#  define TOLOWER tolower
#endif
00083
/* Optional allocation tracing.  With "#if 0" changed to "#if 1" the macros
   keep a running byte count in `memory` and print it together with the
   comment string; as written, both macros expand to nothing. */
#if 0
static int memory = 0;
#  define ALLOC(size, comment)\
    do { memory += size; printf("%8d - %s\n", memory, comment); } while (0)
#  define RELEASE(size, comment)\
    do { memory -= size; printf("%8d - %s\n", memory, comment); } while (0)
#else
#  define ALLOC(size, comment)
#  define RELEASE(size, comment)
#endif
00094
00095
00096
00097
00098
/* Values stored in the parser's `shorttag` and `doctype` state fields
   (0 means "not active"). */
#define MAYBE 1   /* tentatively in this state */
#define SURE  2   /* definitely in this state */
00101
00102
00103 typedef struct {
00104 PyObject_HEAD
00105
00106
00107 int xml;
00108 int unicode;
00109 char *encoding;
00110
00111
00112 int feed;
00113 int shorttag;
00114 int doctype;
00115
00116
00117 char* buffer;
00118 int bufferlen;
00119 int buffertotal;
00120
00121
00122 PyObject* finish_starttag;
00123 PyObject* finish_endtag;
00124 PyObject* handle_proc;
00125 PyObject* handle_special;
00126 PyObject* handle_charref;
00127 PyObject* handle_entityref;
00128 PyObject* handle_data;
00129 PyObject* handle_cdata;
00130 PyObject* handle_comment;
00131
00132 } FastSGMLParserObject;
00133
00134 staticforward PyTypeObject FastSGMLParser_Type;
00135
00136
00137 static int fastfeed(FastSGMLParserObject* self);
00138 static PyObject* attrparse(FastSGMLParserObject* self, const CHAR_T* p, int len);
00139 static int fetchEncoding(FastSGMLParserObject* self, const CHAR_T* data, int len);
00140 static PyObject* stringFromData(FastSGMLParserObject* self, const CHAR_T* data, int len);
00141 static int callWithString(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data, int len);
00142 static int callWith2Strings(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data1, int len1, const CHAR_T* data2, int len2);
00143 static int callWithStringAndObj(FastSGMLParserObject* self, PyObject* callback, const CHAR_T* data, int len, PyObject* obj);
00144
00145 #define callHandleData(self, data, len) callWithString((self), (self)->handle_data, (data), (len))
00146 #define callHandleCData(self, data, len) callWithString((self), (self)->handle_cdata, (data), (len))
00147 #define callHandleComment(self, data, len) callWithString((self), (self)->handle_comment, (data), (len))
00148 #define callHandleEntityRef(self, data, len) callWithString((self), (self)->handle_entityref, (data), (len))
00149 #define callHandleCharRef(self, data, len) callWithString((self), (self)->handle_charref, (data), (len))
00150 #define callHandleSpecial(self, data, len) callWithString((self), (self)->handle_special, (data), (len))
00151 #define callHandleProc(self, data1, len1, data2, len2) callWith2Strings((self), (self)->handle_proc, (data1), (len1), (data2), (len2))
00152 #define callFinishStartTag(self, data, len, obj) callWithStringAndObj((self), (self)->finish_starttag, (data), (len), (obj))
00153 #define callFinishEndTag(self, data, len) callWithString((self), (self)->finish_endtag, (data), (len))
00154
00155
00156
00157
00158 static PyObject*
00159 _sgmlop_new(int xml, int unicode)
00160 {
00161 FastSGMLParserObject* self;
00162
00163 self = PyObject_NEW(FastSGMLParserObject, &FastSGMLParser_Type);
00164 if (self == NULL)
00165 return NULL;
00166
00167 self->xml = xml;
00168 self->unicode = unicode;
00169 self->encoding = NULL;
00170
00171 self->feed = 0;
00172 self->shorttag = 0;
00173 self->doctype = 0;
00174
00175 self->buffer = NULL;
00176 self->bufferlen = 0;
00177 self->buffertotal = 0;
00178
00179 self->finish_starttag = NULL;
00180 self->finish_endtag = NULL;
00181 self->handle_proc = NULL;
00182 self->handle_special = NULL;
00183 self->handle_charref = NULL;
00184 self->handle_entityref = NULL;
00185 self->handle_data = NULL;
00186 self->handle_cdata = NULL;
00187 self->handle_comment = NULL;
00188
00189 return (PyObject*) self;
00190 }
00191
00192 static PyObject*
00193 _sgmlop_sgmlparser(PyObject* self, PyObject* args)
00194 {
00195 if (!PyArg_NoArgs(args))
00196 return NULL;
00197
00198 return _sgmlop_new(0, 0);
00199 }
00200
00201 static PyObject*
00202 _sgmlop_xmlparser(PyObject* self, PyObject* args)
00203 {
00204 if (!PyArg_NoArgs(args))
00205 return NULL;
00206
00207 return _sgmlop_new(1, 0);
00208 }
00209
00210 static PyObject*
00211 _sgmlop_xmlunicodeparser(PyObject* self, PyObject* args)
00212 {
00213 if (!PyArg_NoArgs(args))
00214 return NULL;
00215
00216 return _sgmlop_new(1, 1);
00217 }
00218
00219 static void
00220 _sgmlop_dealloc(FastSGMLParserObject* self)
00221 {
00222 if (self->buffer)
00223 free(self->buffer);
00224 if (self->encoding)
00225 free(self->encoding);
00226 Py_XDECREF(self->finish_starttag);
00227 Py_XDECREF(self->finish_endtag);
00228 Py_XDECREF(self->handle_proc);
00229 Py_XDECREF(self->handle_special);
00230 Py_XDECREF(self->handle_charref);
00231 Py_XDECREF(self->handle_entityref);
00232 Py_XDECREF(self->handle_data);
00233 Py_XDECREF(self->handle_cdata);
00234 Py_XDECREF(self->handle_comment);
00235 PyMem_DEL(self);
00236 }
00237
00238 #define GETCB(member, name)\
00239 Py_XDECREF(self->member);\
00240 self->member = PyObject_GetAttrString(item, name);
00241
00242 static PyObject*
00243 _sgmlop_register(FastSGMLParserObject* self, PyObject* args)
00244 {
00245
00246 PyObject* item;
00247 if (!PyArg_ParseTuple(args, "O", &item))
00248 return NULL;
00249
00250 GETCB(finish_starttag, "finish_starttag");
00251 GETCB(finish_endtag, "finish_endtag");
00252 GETCB(handle_proc, "handle_proc");
00253 GETCB(handle_special, "handle_special");
00254 GETCB(handle_charref, "handle_charref");
00255 GETCB(handle_entityref, "handle_entityref");
00256 GETCB(handle_data, "handle_data");
00257 GETCB(handle_cdata, "handle_cdata");
00258 GETCB(handle_comment, "handle_comment");
00259
00260 PyErr_Clear();
00261
00262 Py_INCREF(Py_None);
00263 return Py_None;
00264 }
00265
00266
00267
00268
00269
00270
00271 static PyObject*
00272 feed(FastSGMLParserObject* self, char* string, int stringlen, int last)
00273 {
00274
00275
00276 int length;
00277
00278 if (self->feed) {
00279
00280
00281 PyErr_SetString(PyExc_AssertionError, "recursive feed");
00282 return NULL;
00283 }
00284
00285
00286 if (!self->buffer) {
00287 length = stringlen;
00288 self->buffer = malloc(length);
00289 self->buffertotal = stringlen;
00290 } else {
00291 length = self->bufferlen + stringlen;
00292 if (length > self->buffertotal) {
00293 self->buffer = realloc(self->buffer, length);
00294 self->buffertotal = length;
00295 }
00296 }
00297 if (!self->buffer) {
00298 PyErr_NoMemory();
00299 return NULL;
00300 }
00301 memcpy(self->buffer + self->bufferlen, string, stringlen);
00302 self->bufferlen = length;
00303
00304 self->feed = 1;
00305
00306 length = fastfeed(self);
00307
00308 self->feed = 0;
00309
00310 if (length < 0)
00311 return NULL;
00312
00313 if (length > self->bufferlen) {
00314
00315 PyErr_SetString(PyExc_AssertionError, "buffer overrun");
00316 return NULL;
00317 }
00318
00319 if (length > 0 && length < self->bufferlen)
00320
00321 memmove(self->buffer, self->buffer + length,
00322 self->bufferlen - length);
00323
00324 self->bufferlen = self->bufferlen - length;
00325
00326
00327
00328
00329
00330
00331
00332 return Py_BuildValue("i", self->bufferlen);
00333 }
00334
00335 static PyObject*
00336 _sgmlop_feed(FastSGMLParserObject* self, PyObject* args)
00337 {
00338
00339
00340 char* string;
00341 int stringlen;
00342 if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
00343 return NULL;
00344
00345 return feed(self, string, stringlen, 0);
00346 }
00347
00348 static PyObject*
00349 _sgmlop_close(FastSGMLParserObject* self, PyObject* args)
00350 {
00351
00352
00353 if (!PyArg_NoArgs(args))
00354 return NULL;
00355
00356 return feed(self, "", 0, 1);
00357 }
00358
00359 static PyObject*
00360 _sgmlop_parse(FastSGMLParserObject* self, PyObject* args)
00361 {
00362
00363
00364 char* string;
00365 int stringlen;
00366 if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
00367 return NULL;
00368
00369 return feed(self, string, stringlen, 1);
00370 }
00371
00372
00373
00374
00375
00376 static PyMethodDef _sgmlop_methods[] = {
00377
00378 {"register", (PyCFunction) _sgmlop_register, 1},
00379
00380 {"feed", (PyCFunction) _sgmlop_feed, 1},
00381 {"close", (PyCFunction) _sgmlop_close, 0},
00382
00383 {"parse", (PyCFunction) _sgmlop_parse, 1},
00384 {NULL, NULL}
00385 };
00386
00387 static PyObject*
00388 _sgmlop_getattr(FastSGMLParserObject* self, char* name)
00389 {
00390 return Py_FindMethod(_sgmlop_methods, (PyObject*) self, name);
00391 }
00392
00393 statichere PyTypeObject FastSGMLParser_Type = {
00394 PyObject_HEAD_INIT(NULL)
00395 0,
00396 "FastSGMLParser",
00397 sizeof(FastSGMLParserObject),
00398 0,
00399
00400 (destructor)_sgmlop_dealloc,
00401 0,
00402 (getattrfunc)_sgmlop_getattr,
00403 0
00404 };
00405
00406
00407
00408
00409 typedef struct {
00410 PyObject_HEAD
00411
00412
00413 PyObject* parent;
00414 PyObject* tag;
00415 PyObject* attrib;
00416 PyObject* text;
00417 PyObject* suffix;
00418
00419
00420 int child_count;
00421 int child_total;
00422 PyObject* *children;
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432 } ElementObject;
00433
00434 staticforward PyTypeObject Element_Type;
00435
00436
00437
00438
00439 static PyObject*
00440 element_new(PyObject* _self, PyObject* args)
00441 {
00442 ElementObject* self;
00443
00444 PyObject* parent;
00445 PyObject* tag;
00446 PyObject* attrib = Py_None;
00447 PyObject* text = Py_None;
00448 PyObject* suffix = Py_None;
00449 if (!PyArg_ParseTuple(args, "OO|OOO", &parent, &tag,
00450 &attrib, &text, &suffix))
00451 return NULL;
00452
00453 if (parent != Py_None && parent->ob_type != &Element_Type) {
00454 PyErr_SetString(PyExc_TypeError, "parent must be Element or None");
00455 return NULL;
00456 }
00457
00458 self = PyObject_NEW(ElementObject, &Element_Type);
00459 if (self == NULL)
00460 return NULL;
00461
00462 Py_INCREF(parent);
00463 self->parent = parent;
00464
00465 Py_INCREF(tag);
00466 self->tag = tag;
00467
00468 Py_INCREF(attrib);
00469 self->attrib = attrib;
00470
00471 Py_INCREF(text);
00472 self->text = text;
00473
00474 Py_INCREF(suffix);
00475 self->suffix = suffix;
00476
00477 self->child_count = 0;
00478 self->child_total = 0;
00479 self->children = NULL;
00480
00481 ALLOC(sizeof(ElementObject), "create element");
00482
00483 return (PyObject*) self;
00484 }
00485
00486 static void
00487 element_dealloc(ElementObject* self)
00488 {
00489 int i;
00490
00491
00492
00493
00494
00495 if (self->children) {
00496 for (i = 0; i < self->child_count; i++)
00497 Py_DECREF(self->children[i]);
00498 free(self->children);
00499 }
00500
00501
00502 Py_DECREF(self->parent);
00503
00504
00505 Py_DECREF(self->tag);
00506 Py_XDECREF(self->attrib);
00507 Py_XDECREF(self->text);
00508 Py_XDECREF(self->suffix);
00509
00510 RELEASE(sizeof(ElementObject), "destroy element");
00511
00512 PyMem_DEL(self);
00513 }
00514
00515
00516
00517
00518 static PyObject*
00519 element_append(ElementObject* self, PyObject* args)
00520 {
00521 int total;
00522
00523 PyObject* element;
00524 if (!PyArg_ParseTuple(args, "O!", &Element_Type, &element))
00525 return NULL;
00526
00527 if (!self->children) {
00528 total = 10;
00529 self->children = malloc(total * sizeof(PyObject*));
00530 self->child_total = total;
00531 } else if (self->child_count >= self->child_total) {
00532 total = self->child_total + 10;
00533 self->children = realloc(self->children, total * sizeof(PyObject*));
00534 self->child_total = total;
00535 }
00536 if (!self->children) {
00537 PyErr_NoMemory();
00538 return NULL;
00539 }
00540
00541 Py_INCREF(element);
00542 self->children[self->child_count++] = element;
00543
00544 Py_INCREF(Py_None);
00545 return Py_None;
00546 }
00547
00548 static PyObject*
00549 element_destroy(ElementObject* self, PyObject* args)
00550 {
00551 int i;
00552 PyObject* res;
00553
00554 if (!PyArg_NoArgs(args))
00555 return NULL;
00556
00557
00558 if (self->parent != Py_None) {
00559 Py_DECREF(self->parent);
00560 self->parent = Py_None;
00561 Py_INCREF(self->parent);
00562 }
00563
00564
00565 if (self->children) {
00566 for (i = 0; i < self->child_count; i++) {
00567 res = element_destroy((ElementObject*) self->children[i], args);
00568 Py_DECREF(res);
00569 Py_DECREF(self->children[i]);
00570 }
00571 self->child_count = 0;
00572 }
00573
00574
00575
00576 Py_INCREF(Py_None);
00577 return Py_None;
00578 }
00579
00580 static PyObject *
00581 element_get(ElementObject* self, PyObject* args)
00582 {
00583 PyObject* value;
00584
00585 PyObject* key;
00586 PyObject* default_value = Py_None;
00587 if (!PyArg_ParseTuple(args, "O|O", &key, &default_value))
00588 return NULL;
00589
00590 value = PyDict_GetItem(self->attrib, key);
00591 if (!value) {
00592 value = default_value;
00593 PyErr_Clear();
00594 }
00595
00596 Py_INCREF(value);
00597 return value;
00598 }
00599
00600 static PyObject*
00601 element_getitem(ElementObject* self, int index)
00602 {
00603 if (index < 0 || index >= self->child_count) {
00604 PyErr_SetString(PyExc_IndexError, "child index out of range");
00605 return NULL;
00606 }
00607
00608 Py_INCREF(self->children[index]);
00609 return self->children[index];
00610 }
00611
00612 static int
00613 element_length(ElementObject* self)
00614 {
00615 return self->child_count;
00616 }
00617
00618 static PyObject*
00619 element_repr(ElementObject* self)
00620 {
00621 char buf[300];
00622 if (PyString_Check(self->tag))
00623 sprintf(
00624 buf, "<Element object '%.256s' at %lx>",
00625 PyString_AsString(self->tag),
00626 (long) self
00627 );
00628 else
00629 sprintf(
00630 buf, "<Element object at %lx>",
00631 (long) self
00632 );
00633
00634 return PyString_FromString(buf);
00635 }
00636
00637
00638
00639
00640 static PyMethodDef element_methods[] = {
00641 {"get", (PyCFunction) element_get, 1},
00642 {"append", (PyCFunction) element_append, 1},
00643 {"destroy", (PyCFunction) element_destroy, 0},
00644 {NULL, NULL}
00645 };
00646
00647 static PyObject*
00648 element_getattr(ElementObject* self, char* name)
00649 {
00650 PyObject* res;
00651
00652 res = Py_FindMethod(element_methods, (PyObject*) self, name);
00653 if (res)
00654 return res;
00655
00656 PyErr_Clear();
00657
00658 if (strcmp(name, "tag") == 0)
00659 res = self->tag;
00660 else if (strcmp(name, "text") == 0)
00661 res = self->text;
00662 else if (strcmp(name, "suffix") == 0)
00663 res = self->suffix;
00664 else if (strcmp(name, "attrib") == 0)
00665 res = self->attrib;
00666 else if (strcmp(name, "parent") == 0)
00667 res = self->parent;
00668 else {
00669 PyErr_SetString(PyExc_AttributeError, name);
00670 return NULL;
00671 }
00672
00673 Py_INCREF(res);
00674 return res;
00675 }
00676
00677 static int
00678 element_setattr(ElementObject *self, const char* name, PyObject* value)
00679 {
00680 if (value == NULL) {
00681 PyErr_SetString(PyExc_AttributeError,
00682 "can't delete element attributes");
00683 return -1;
00684 }
00685
00686 if (strcmp(name, "text") == 0) {
00687
00688 Py_DECREF(self->text);
00689 self->text = value;
00690 Py_INCREF(self->text);
00691
00692 } else if (strcmp(name, "suffix") == 0) {
00693
00694 Py_DECREF(self->suffix);
00695 self->suffix = value;
00696 Py_INCREF(self->suffix);
00697
00698 } else if (strcmp(name, "attrib") == 0) {
00699
00700 Py_DECREF(self->attrib);
00701 self->attrib = value;
00702 Py_INCREF(self->attrib);
00703
00704 } else {
00705
00706 PyErr_SetString(PyExc_AttributeError, name);
00707 return -1;
00708
00709 }
00710
00711 return 0;
00712 }
00713
00714 static PySequenceMethods element_as_sequence = {
00715 (inquiry) element_length,
00716 0,
00717 0,
00718 (intargfunc) element_getitem,
00719 0,
00720 0,
00721 0,
00722 };
00723
00724 statichere PyTypeObject Element_Type = {
00725 PyObject_HEAD_INIT(NULL)
00726 0,
00727 "Element",
00728 sizeof(ElementObject),
00729 0,
00730
00731 (destructor)element_dealloc,
00732 0,
00733 (getattrfunc)element_getattr,
00734 (setattrfunc)element_setattr,
00735 0,
00736 (reprfunc)element_repr,
00737 0,
00738 &element_as_sequence,
00739 0
00740 };
00741
00742
00743
00744
00745
00746 typedef struct {
00747 PyObject_HEAD
00748
00749 PyObject* root;
00750
00751 PyObject* this;
00752 PyObject* last;
00753 PyObject* data;
00754
00755 } TreeBuilderObject;
00756
00757 staticforward PyTypeObject TreeBuilder_Type;
00758
00759
00760
00761
00762 static PyObject*
00763 treebuilder_new(PyObject* _self, PyObject* args)
00764 {
00765 TreeBuilderObject* self;
00766
00767
00768 if (!PyArg_NoArgs(args))
00769 return NULL;
00770
00771 self = PyObject_NEW(TreeBuilderObject, &TreeBuilder_Type);
00772 if (self == NULL)
00773 return NULL;
00774
00775 Py_INCREF(Py_None);
00776 self->root = Py_None;
00777
00778 self->this = NULL;
00779 self->last = NULL;
00780 self->data = NULL;
00781
00782 return (PyObject*) self;
00783 }
00784
00785 static void
00786 treebuilder_dealloc(TreeBuilderObject* self)
00787 {
00788 Py_XDECREF(self->data);
00789 Py_XDECREF(self->last);
00790 Py_XDECREF(self->this);
00791 Py_DECREF(self->root);
00792 PyMem_DEL(self);
00793 }
00794
00795
00796
00797
00798 static PyObject*
00799 treebuilder_start(TreeBuilderObject* self, PyObject* args)
00800 {
00801 PyObject* tag;
00802 PyObject* attrib = Py_None;
00803 if (!PyArg_ParseTuple(args, "O|O", &tag, &attrib))
00804 return NULL;
00805
00806
00807
00808 Py_INCREF(Py_None);
00809 return Py_None;
00810 }
00811
00812 static PyObject*
00813 treebuilder_end(TreeBuilderObject* self, PyObject* args)
00814 {
00815 PyObject* tag;
00816 if (!PyArg_ParseTuple(args, "O", &tag))
00817 return NULL;
00818
00819
00820
00821 Py_INCREF(Py_None);
00822 return Py_None;
00823 }
00824
00825 static PyObject *
00826 treebuilder_data(TreeBuilderObject* self, PyObject* args)
00827 {
00828 PyObject* data;
00829 if (!PyArg_ParseTuple(args, "O", &data))
00830 return NULL;
00831
00832
00833
00834 Py_INCREF(Py_None);
00835 return Py_None;
00836 }
00837
00838
00839
00840
00841 static PyMethodDef treebuilder_methods[] = {
00842 {"data", (PyCFunction) treebuilder_data, 1},
00843 {"start", (PyCFunction) treebuilder_start, 1},
00844 {"end", (PyCFunction) treebuilder_end, 1},
00845 {NULL, NULL}
00846 };
00847
00848 static PyObject*
00849 treebuilder_getattr(ElementObject* self, char* name)
00850 {
00851 return Py_FindMethod(treebuilder_methods, (PyObject*) self, name);
00852 }
00853
00854 statichere PyTypeObject TreeBuilder_Type = {
00855 PyObject_HEAD_INIT(NULL)
00856 0,
00857 "TreeBuilder",
00858 sizeof(TreeBuilderObject),
00859 0,
00860
00861 (destructor)treebuilder_dealloc,
00862 0,
00863 (getattrfunc)treebuilder_getattr,
00864 0,
00865 0,
00866 0,
00867 0,
00868 0,
00869 0
00870 };
00871
00872
00873
00874
00875
00876 static PyMethodDef _functions[] = {
00877 {"SGMLParser", _sgmlop_sgmlparser, 0},
00878 {"XMLParser", _sgmlop_xmlparser, 0},
00879 {"XMLUnicodeParser", _sgmlop_xmlunicodeparser, 0},
00880 {"Element", element_new, 1},
00881 {"TreeBuilder", treebuilder_new, 0},
00882 {NULL, NULL}
00883 };
00884
00885 DL_EXPORT(void)
00886 initsgmlop(void)
00887 {
00888
00889 FastSGMLParser_Type.ob_type =
00890 Element_Type.ob_type =
00891 TreeBuilder_Type.ob_type = &PyType_Type;
00892
00893 Py_InitModule("sgmlop", _functions);
00894 }
00895
00896
00897
00898
00899
00900
/* Token codes produced by the scanner.  Every markup token has the TAG
   bit (0x100) set, so "token & TAG" separates markup from CDATA
   sections, references and comments. */
#define TAG         0x100
#define TAG_START   0x101
#define TAG_END     0x102
#define TAG_EMPTY   0x103
#define DIRECTIVE   0x104
#define DOCTYPE     0x105
#define PI          0x106
#define DTD_START   0x107
#define DTD_END     0x108
#define DTD_ENTITY  0x109
#define CDATA       0x200
#define ENTITYREF   0x400
#define CHARREF     0x401
#define COMMENT     0x800
00915
00916 static int
00917 fastfeed(FastSGMLParserObject* self)
00918 {
00919 CHAR_T *end;
00920 CHAR_T *p, *q, *s;
00921 CHAR_T *b, *t, *e;
00922
00923 int token;
00924
00925 s = q = p = (CHAR_T*) self->buffer;
00926 end = (CHAR_T*) (self->buffer + self->bufferlen);
00927
00928 while (p < end) {
00929
00930 q = p;
00931
00932 if (*p == '<') {
00933 int has_attr;
00934
00935
00936 token = TAG_START;
00937 if (++p >= end)
00938 goto eol;
00939
00940 if (*p == '!') {
00941
00942 if (++p >= end)
00943 goto eol;
00944 token = DIRECTIVE;
00945 b = t = p;
00946 if (*p == '-') {
00947
00948 token = COMMENT;
00949 b = p + 2;
00950 for (;;) {
00951 if (p+3 >= end)
00952 goto eol;
00953 if (p[1] != '-')
00954 p += 2;
00955 else if (p[0] != '-' || p[2] != '>')
00956 p++;
00957 else
00958 break;
00959 }
00960 e = p;
00961 p += 3;
00962 goto eot;
00963 } else if (self->xml) {
00964
00965
00966
00967
00968 if (*p == 'D' ) {
00969
00970
00971 token = DOCTYPE;
00972 self->doctype = MAYBE;
00973 } else if (*p == '[') {
00974
00975
00976
00977
00978 token = CDATA;
00979 b = t = p + 7;
00980 for (;;) {
00981 if (p+3 >= end)
00982 goto eol;
00983 if (p[1] != ']')
00984 p += 2;
00985 else if (p[0] != ']' || p[2] != '>')
00986 p++;
00987 else
00988 break;
00989 }
00990 e = p;
00991 p += 3;
00992 goto eot;
00993 }
00994 }
00995 } else if (*p == '?') {
00996 token = PI;
00997 if (++p >= end)
00998 goto eol;
00999 } else if (*p == '/') {
01000
01001 token = TAG_END;
01002 if (++p >= end)
01003 goto eol;
01004 }
01005
01006
01007 b = p;
01008 if (!self->xml)
01009 while (ISALNUM(*p) || *p == '-' || *p == '.' ||
01010 *p == ':' || *p == '?') {
01011 *p = (CHAR_T) TOLOWER(*p);
01012 if (++p >= end)
01013 goto eol;
01014 }
01015 else
01016 while (ISALNUM(*p) || *p == '-' || *p == '.' || *p == '_' ||
01017 *p == ':' || *p == '?') {
01018 if (++p >= end)
01019 goto eol;
01020 }
01021
01022 t = p;
01023
01024 has_attr = 0;
01025
01026 if (*p == '/' && !self->xml) {
01027
01028 token = TAG_START;
01029 e = p;
01030 if (++p >= end)
01031 goto eol;
01032 if (*p == '>') {
01033
01034 token = TAG_EMPTY;
01035 if (++p >= end)
01036 goto eol;
01037 } else
01038
01039 self->shorttag = SURE;
01040
01041
01042
01043 } else {
01044
01045
01046 int quote = 0;
01047 int last = 0;
01048 if (token==PI && self->xml) {
01049 int found = 0;
01050 while ((*p!='>') || (!found)) {
01051 found = (*p=='?');
01052 if (++p >= end)
01053 goto eol;
01054 }
01055 last = '?';
01056 }
01057 else {
01058 while (*p != '>' || quote) {
01059 if (!ISSPACE(*p)) {
01060 has_attr = 1;
01061
01062 }
01063 if (quote) {
01064 if (*p == quote)
01065 quote = 0;
01066 } else {
01067 if (*p == '"' || *p == '\'')
01068 quote = *p;
01069 }
01070 if (*p == '[' && !quote && self->doctype) {
01071 self->doctype = SURE;
01072 token = DTD_START;
01073 e = p++;
01074 goto eot;
01075 }
01076 last = *p;
01077 if (++p >= end)
01078 goto eol;
01079 }
01080 }
01081
01082 e = p++;
01083
01084 if (last == '/') {
01085
01086 e--;
01087 token = TAG_EMPTY;
01088 } else if (token == PI && last == '?')
01089 e--;
01090
01091 if (self->doctype == MAYBE)
01092 self->doctype = 0;
01093
01094 if (has_attr)
01095 ;
01096
01097 }
01098
01099 } else if (*p == '/' && self->shorttag) {
01100
01101
01102 token = TAG_END;
01103 self->shorttag = 0;
01104 b = t = e = p;
01105 if (++p >= end)
01106 goto eol;
01107
01108 } else if (*p == ']' && self->doctype) {
01109
01110
01111 token = DTD_END;
01112
01113 b = t = e = p;
01114 if (++p >= end)
01115 goto eol;
01116 self->doctype = 0;
01117
01118 } else if (*p == '%' && self->doctype) {
01119
01120
01121 token = DTD_ENTITY;
01122 if (++p >= end)
01123 goto eol;
01124 b = t = p;
01125 while (ISALNUM(*p) || *p == '.')
01126 if (++p >= end)
01127 goto eol;
01128 e = p;
01129 if (*p == ';')
01130 p++;
01131
01132 } else if (*p == '&') {
01133
01134
01135 token = ENTITYREF;
01136 if (++p >= end)
01137 goto eol;
01138 if (*p == '#') {
01139 token = CHARREF;
01140 if (++p >= end)
01141 goto eol;
01142 }
01143 b = t = p;
01144 if (self->xml) {
01145 while (ISALNUM(*p) || *p == '.' || *p == '-' || *p == '_' || *p == ':')
01146 if (++p >= end)
01147 goto eol;
01148 } else {
01149 while (ISALNUM(*p) || *p == '.')
01150 if (++p >= end)
01151 goto eol;
01152 }
01153 e = p;
01154 if (*p == ';')
01155 p++;
01156 else
01157 continue;
01158
01159 } else {
01160
01161
01162 if (++p >= end) {
01163 q = p;
01164 goto eol;
01165 }
01166 continue;
01167
01168 }
01169
01170 eot:
01171
01172 if (q != s && self->handle_data) {
01173
01174 if (callHandleData(self, s, q-s))
01175 return -1;
01176 }
01177
01178
01179 if (token & TAG) {
01180 if (token == TAG_END) {
01181 if (self->finish_endtag) {
01182 if (callFinishEndTag(self, b, t-b))
01183 return -1;
01184 }
01185 } else if (token == DIRECTIVE || token == DOCTYPE) {
01186 if (self->handle_special) {
01187 if (callHandleSpecial(self, b, e-b))
01188 return -1;
01189 }
01190 } else if (token == PI) {
01191 if (self->handle_proc) {
01192 int len = t-b;
01193 while (ISSPACE(*t))
01194 t++;
01195 if ((len==3) && (b[0]=='x') && (b[1]=='m') && (b[2]=='l'))
01196 fetchEncoding(self, t, e-t);
01197
01198 if (callHandleProc(self, b, len, t, e-t))
01199 return -1;
01200 }
01201 } else if (self->finish_starttag) {
01202 PyObject* attr;
01203 int len = t-b;
01204 while (ISSPACE(*t))
01205 t++;
01206 attr = attrparse(self, t, e-t);
01207 if (!attr)
01208 return -1;
01209 if (callFinishStartTag(self, b, len, attr))
01210 {
01211 Py_DECREF(attr);
01212 return -1;
01213 }
01214 Py_DECREF(attr);
01215 if (token == TAG_EMPTY && self->finish_endtag) {
01216 if (callFinishEndTag(self, b, len))
01217 return -1;
01218 }
01219 }
01220 } else if (token == ENTITYREF && self->handle_entityref) {
01221 if (callHandleEntityRef(self, b, e-b))
01222 return -1;
01223 } else if (token == CHARREF && (self->handle_charref ||
01224 self->handle_data)) {
01225 if (self->handle_charref)
01226 {
01227 if (callHandleCharRef(self, b, e-b))
01228 return -1;
01229 }
01230 else {
01231
01232 int ch = 0;
01233 CHAR_T *p;
01234 if (*b == 'x') {
01235 for (p = b+1; p < e; p++)
01236 ch = ch*16 + *p - (*p > 'F' ?
01237 'a'-10 :(*p > '9' ?
01238 'A'-10 : '0'));
01239 } else {
01240 for (p = b; p < e; p++)
01241 ch = ch*10 + *p - '0';
01242 }
01243 #ifdef Py_USING_UNICODE
01244 if (self->unicode) {
01245 PyObject *res;
01246 Py_UNICODE uch = ch;
01247 int maxunicode = PyUnicode_GetMax();
01248
01249 if (ch > maxunicode) {
01250 PyErr_Format(PyExc_ValueError,
01251 "character reference &#x%x; exceeds sys.maxunicode (0x%x)", ch, maxunicode);
01252 return -1;
01253 }
01254 res = PyObject_CallFunction(self->handle_data,
01255 "u#", &uch, 1);
01256 if (!res)
01257 return -1;
01258 Py_DECREF(res);
01259 } else
01260 #endif
01261 {
01262 char nch;
01263 if (ch >= 128) {
01264
01265 PyErr_Format(PyExc_ValueError,
01266 "character reference &#x%x; exceeds ASCII range", ch);
01267 return -1;
01268 }
01269 nch = ch;
01270 if (callHandleData(self, &nch, 1))
01271 return -1;
01272 }
01273 }
01274 } else if (token == CDATA && (self->handle_cdata ||
01275 self->handle_data)) {
01276 if (self->handle_cdata) {
01277 if (callHandleCData(self, b, e-b))
01278 return -1;
01279 } else {
01280
01281 if (callHandleData(self, b, e-b))
01282 return -1;
01283 }
01284 } else if (token == COMMENT && self->handle_comment) {
01285 if (callHandleComment(self, b, e-b))
01286 return -1;
01287 }
01288
01289 q = p;
01290 s = p;
01291 }
01292
01293 eol:
01294 if (q != s && self->handle_data) {
01295 if (callHandleData(self, s, q-s))
01296 r