function [tree, RootName, DOMnode] = xml_read(xmlfile, Pref)
%XML_READ reads xml files and converts them into Matlab's struct tree.
%
% DESCRIPTION
% tree = xml_read(xmlfile) reads 'xmlfile' into data structure 'tree'
%
% tree = xml_read(xmlfile, Pref) reads 'xmlfile' into data structure 'tree'
% according to your preferences
%
% [tree, RootName, DOMnode] = xml_read(xmlfile) get additional information
% about the XML file
%
% INPUT:
%  xmlfile  URL or filename of xml file to read. An already parsed document
%           (org.apache.xerces.dom.DeferredDocumentImpl) is also accepted,
%           in which case xmlread is not called.
%  Pref     Preferences (struct, all fields optional):
%    Pref.TableName - default {'tr','td'} - names of the special tags used
%                     to itemize 2D cell arrays
%    Pref.ItemName  - default 'item' - name of a special tag used to itemize
%                     1D cell arrays
%    Pref.CellItem  - default false - leave 'item' nodes in cell notation
%    Pref.ReadAttr  - default true - allow reading attributes
%    Pref.ReadSpec  - default true - allow reading special nodes: comments,
%                     CData, etc.
%    Pref.Str2Num   - default 'smart' - convert strings that look like
%                     numbers to numbers. Options: 'always', 'never' and
%                     'smart'. Unrecognized strings fall back to 'smart'.
%    Pref.KeepNS    - default true - keep or strip namespace info
%    Pref.NoCells   - default true - force output to have no cell arrays
%    Pref.NumLevels - default 1e10 - how many recursive levels are allowed.
%                     Can be used to speed up the function by pruning the
%                     tree.
%    Pref.PreserveSpace - default false - preserve or delete spaces at the
%                     beginning and the end of strings
%    Pref.RootOnly  - default true - output variable 'tree' corresponds to
%                     xml file root element, otherwise it corresponds to the
%                     whole file.
%    Pref.Debug     - default false - let errors propagate unwrapped (true)
%                     or wrap them into a short message (false)
%
% OUTPUT:
%  tree      tree of structs and/or cell arrays corresponding to xml file
%  RootName  1x3 cell array: {name of the root node, top-level
%            "processing instruction" text, top-level comment text}.
%            Entries 2 and 3 are empty when the document has none.
%  DOMnode   output of xmlread (or 'xmlfile' itself if it was a document)
%
% DETAILS:
% Function xml_read first calls MATLAB's xmlread function and then
% converts its output ('Document Object Model' tree of Java objects)
% to tree of MATLAB structs. The output is in format of nested structs
% and cells. In the output data structure field names are based on
% XML tags, except in cases when tags produce illegal variable names.
%
% Several special xml node types result in special tags for fields of
% 'tree' nodes:
%  - node.CONTENT - stores data section of the node if other fields are
%    present. Usually data section is stored directly in 'node'.
%  - node.ATTRIBUTE.name - stores node's attribute called 'name'.
%  - node.COMMENT - stores node's comment section (string). For global
%    comments see "RootName" output variable.
%  - node.CDATA_SECTION - stores node's CDATA section (string).
%  - node.PROCESSING_INSTRUCTION - stores "processing instruction" child
%    node. For global "processing instructions" see "RootName" output
%    variable.
%  - other node types are reported with a warning and ignored.
%
% EXAMPLES:
%   MyTree = [];
%   MyTree.MyNumber = 13;
%   MyTree.MyString = 'Hello World';
%   xml_write('test.xml', MyTree);
%   [tree treeName] = xml_read ('test.xml');
%   % See also xml_examples.m
%
% See also:
%   xml_write, xmlread, xmlwrite
%
% Written by Jarek Tuszynski, SAIC, jaroslaw.w.tuszynski_at_saic.com
% References:
%  - Function inspired by Example 3 found in xmlread function.
%  - Output data structures inspired by xml_toolbox structures.
%
% NOTE(review): this copy was restored from a damaged listing; statements
% that were missing were inferred from the surrounding code - confirm
% against the upstream xml_io_tools release.

%% default preferences
DPref.TableName     = {'tr','td'}; % names of special tags used to itemize 2D cell arrays
DPref.ItemName      = 'item';      % name of a special tag used to itemize 1D cell arrays
DPref.CellItem      = false;       % leave 'item' nodes in cell notation
DPref.ReadAttr      = true;        % allow reading attributes
DPref.ReadSpec      = true;        % allow reading special nodes: comments, CData, etc.
DPref.KeepNS        = true;        % keep or strip namespace info
DPref.Str2Num       = 'smart';     % convert strings that look like numbers to numbers
DPref.NoCells       = true;        % force output to have no cell arrays
DPref.NumLevels     = 1e10;        % number of recurrence levels
DPref.PreserveSpace = false;       % preserve or delete spaces at the beginning and the end of strings?
RootOnly            = true;        % return root node with no top level special nodes
Debug               = false;       % show specific errors (true) or general (false)?

% The MATLAB version test (7.1 or newer required) was already disabled in
% this copy, so the unused ver('MATLAB') lookup that fed it is not kept.

%% read user preferences
if (nargin>1)
  % every field of DPref may be overridden by a same-named field of Pref
  prefNames = fieldnames(DPref);
  for iPref = 1:numel(prefNames)
    if (isfield(Pref, prefNames{iPref}))
      DPref.(prefNames{iPref}) = Pref.(prefNames{iPref});
    end
  end
  if (isfield(Pref, 'RootOnly')), RootOnly = Pref.RootOnly; end
  if (isfield(Pref, 'Debug'   )), Debug    = Pref.Debug;    end
end
if ischar(DPref.Str2Num) % convert from character description to numbers
  DPref.Str2Num = find(strcmpi(DPref.Str2Num, {'never', 'smart', 'always'}))-1;
  if isempty(DPref.Str2Num), DPref.Str2Num=1; end % 1-smart by default
end

% printable description of the input, used in error messages only
if ischar(xmlfile)
  srcName = xmlfile;
else
  srcName = ['<', class(xmlfile), '>'];
end

%% read xml file using Matlab function
if isa(xmlfile, 'org.apache.xerces.dom.DeferredDocumentImpl')
  % if xmlfile is a DOMnode then skip the call to xmlread
  DOMnode = xmlfile;
elseif (Debug) % in debugging mode crashes are allowed
  DOMnode = xmlread(xmlfile);
else           % in normal mode crashes are reported with a short message
  try
    DOMnode = xmlread(xmlfile);
  catch ME
    % a single try/catch keeps the detailed report; the former nested
    % form always replaced it with the generic message
    error('xml_io_tools:read:fileRead', ...
      'Failed to read XML file %s: \n%s', srcName, getReport(ME, 'basic'));
  end
end
Node = DOMnode.getFirstChild;

%% Find the Root node. Also store data from Global Comment and Processing
%  Instruction nodes, if any.
GlobalTextNodes = cell(1,3); % {root name, processing instruction, comment}
GlobalProcInst  = [];
GlobalComment   = [];
RootNode        = [];
while (~isempty(Node))
  if (Node.getNodeType==Node.ELEMENT_NODE)
    RootNode = Node;
  elseif (Node.getNodeType==Node.PROCESSING_INSTRUCTION_NODE)
    data   = strtrim(char(Node.getData));
    target = strtrim(char(Node.getTarget));
    GlobalProcInst = [target, ' ', data];
    GlobalTextNodes{2} = GlobalProcInst;
  elseif (Node.getNodeType==Node.COMMENT_NODE)
    GlobalComment = strtrim(char(Node.getData));
    GlobalTextNodes{3} = GlobalComment;
  end
  % top-level document type nodes are not captured
  Node = Node.getNextSibling;
end
if (isempty(RootNode))
  error('xml_io_tools:read:noRoot', ...
    'No root element found in XML document %s.', srcName);
end

%% parse xml file through calls to recursive DOMnode2struct function
if (Debug)   % in debugging mode crashes are allowed
  [tree, RootName] = DOMnode2struct(RootNode, DPref, 1);
else         % in normal mode crashes are reported with a short message
  try
    [tree, RootName] = DOMnode2struct(RootNode, DPref, 1);
  catch ME
    error('xml_io_tools:read:parse', ...
      'Unable to parse XML file %s: \n %s.', srcName, getReport(ME, 'basic'));
  end
end

%% If there were any Global Text nodes then return them
if (~RootOnly)
  % wrap the root element together with the top-level special nodes
  if (~isempty(GlobalProcInst) && DPref.ReadSpec)
    t.PROCESSING_INSTRUCTION = GlobalProcInst;
  end
  if (~isempty(GlobalComment) && DPref.ReadSpec)
    t.COMMENT = GlobalComment;
  end
  t.(RootName) = tree;
  tree = t;
end
% GlobalTextNodes is a 1x3 cell and therefore never empty, so RootName is
% always returned in cell form (kept for backward compatibility)
GlobalTextNodes{1} = RootName;
RootName = GlobalTextNodes;
%% =======================================================================
%  === DOMnode2struct Function ===========================================
%  =======================================================================
function [s, TagName, LeafNode] = DOMnode2struct(node, Pref, level)
% Recursively convert one DOM node (and its subtree) into MATLAB data.
%
% INPUT:
%  node   DOM node (Java object) to convert
%  Pref   preference struct; fields used here: KeepNS, ReadSpec,
%         PreserveSpace, Str2Num, NumLevels, TableName, ItemName,
%         ReadAttr, CellItem and NoCells
%  level  recursion depth of 'node' (the caller passes 1 for the root)
% OUTPUT:
%  s         converted data: string/number for leaf nodes, struct (or
%            cell / struct array) for nodes with children, [] if empty
%  TagName   field name derived from the node (see NodeName)
%  LeafNode  node kind code returned by NodeName; set to -1 for special
%            nodes when Pref.ReadSpec is false
%
% NOTE(review): restored from a damaged listing; control-flow keywords and
% a few short statements were inferred - confirm against upstream.

%% === Step 1: Get node name and check if it is a leaf node ==============
[TagName, LeafNode] = NodeName(node, Pref.KeepNS);
s = []; % initialize output structure

%% === Step 2: Process Leaf Nodes (nodes with no children) ===============
if (LeafNode)
  if (LeafNode>1 && ~Pref.ReadSpec), LeafNode=-1; end % tags only so ignore special nodes
  if (LeafNode>0) % supported leaf node types
    try % errors here are often due to VERY large fields (like images) that overflow java memory
      s = char(node.getData);
      if (isempty(s)), s = ' '; end % make it a string
      % for some reason current xmlread 'creates' a lot of empty text
      % fields with first character=10 - those will be deleted.
      if (~Pref.PreserveSpace || s(1)==10)
        if (isspace(s(1)) || isspace(s(end))), s = strtrim(s); end % trim spaces if any
      end
      if (LeafNode==1), s = str2var(s, Pref.Str2Num, 0); end % convert to number(s) if needed
    catch ME
      % best effort: report and keep going with whatever was read so far
      warning('xml_io_tools:read:LeafRead', ...
        'This leaf node could not be read and was ignored. %s', ...
        getReport(ME, 'basic'));
    end
  end
  if (LeafNode==3) % ProcessingInstructions need special treatment
    target = strtrim(char(node.getTarget));
    s = [target, ' ', s];
  end
  return % We are done: the rest of the function deals with nodes with children
end
if (level>Pref.NumLevels+1), return; end % if Pref.NumLevels is reached then we are done

%% === Step 3: Process nodes with children ===============================
f = []; % f.(name) counts the children carrying each name
if (node.hasChildNodes)        % children present
  Child  = node.getChildNodes; % create array of children nodes
  nChild = Child.getLength;    % number of children

  % --- pass 1: how many children with each name -----------------------
  for iChild = 1:nChild        % read in each child
    [cname, cLeaf] = NodeName(Child.item(iChild-1), Pref.KeepNS);
    if (cLeaf<0), continue; end % unsupported leaf node types
    if (~isfield(f,cname))
      f.(cname) = 0;           % initialize first time I see this name
    end
    f.(cname) = f.(cname)+1;   % add to the counter
  end
  % text_nodes become CONTENT & for some reason current xmlread 'creates' a
  % lot of empty text fields so f.CONTENT value should not be trusted
  if (isfield(f,'CONTENT') && f.CONTENT>2), f.CONTENT=2; end

  % --- pass 2: store all the children as struct of cell arrays ----------
  for iChild = 1:nChild        % read in each child
    [c, cname, cLeaf] = DOMnode2struct(Child.item(iChild-1), Pref, level+1);
    if (cLeaf && isempty(c))   % if empty leaf node then skip
      continue;                % usually empty text node or one of unhandled node types
    elseif (nChild==1 && cLeaf==1)
      s = c;                   % shortcut for a common case
    else                       % if normal node
      if (level>Pref.NumLevels), continue; end
      n = f.(cname);           % pass 1 total, or number stored so far
      if (~isfield(s,cname))   % encountered this name for the first time
        if (n==1)              % if there will be only one of them ...
          s.(cname) = c;       % then save it in format it came in
        else                   % if there will be many of them ...
          s.(cname) = cell(1,n);
          s.(cname){1} = c;    % then save as cell array
        end
        f.(cname) = 1;         % from now on f counts stored elements
      else                     % already have seen this name
        s.(cname){n+1} = c;    % add to the array
        f.(cname) = n+1;       % add to the array counter
      end
    end
  end
end % end if (node.hasChildNodes)

%% === Step 4: Post-process struct's created for nodes with children =====
if (isstruct(s))
  fields = fieldnames(s);
  nField = length(fields);

  % Detect structure that looks like Html table and store it in cell Matrix
  if (nField==1 && strcmpi(fields{1},Pref.TableName{1}))
    tr = s.(Pref.TableName{1});
    % a single table row arrives as a plain struct: wrap it so that the
    % cell indexing below does not fail
    if (~iscell(tr)), tr = {tr}; end
    if (~isempty(tr) && isstruct(tr{1}))
      fields2 = fieldnames(tr{1});
      if (length(fields2)==1 && strcmpi(fields2{1},Pref.TableName{2}))
        % This seems to be a special structure such that for
        % Pref.TableName = {'tr','td'} 's' corresponds to
        %    [tr] [td]M11[/td] [td]M12[/td] [/tr]
        %    [tr] [td]M21[/td] [td]M22[/td] [/tr]
        % (square brackets stand for xml angle brackets).
        % Recognize it as encoding for 2D struct
        nr = length(tr);
        for r = 1:nr
          row = tr{r}.(Pref.TableName{2});
          Table(r,1:length(row)) = row; %#ok<AGROW>
        end
        s = [];
        s.(Pref.ItemName) = Table;
      end
    end
  end

  % --- Post-processing: convert 'struct of cell-arrays' to 'array of structs'
  % Example: let say s has 3 fields s.a, s.b & s.c and each field is a
  % cell-array with more than one cell-element and all 3 have the same length.
  % Then change it to array of structs, each with single cell.
  % This way element s.a{1} will be now accessed through s(1).a
  vec = zeros(size(fields));
  for i=1:nField, vec(i) = f.(fields{i}); end
  if (numel(vec)>1 && vec(1)>1 && var(vec)==0)  % convert from struct of
    s = cell2struct(struct2cell(s), fields, 1); % arrays to array of struct
  end
end

%% === Step 5: Process nodes with attributes =============================
if (node.hasAttributes && Pref.ReadAttr)
  if (~isstruct(s))               % make into struct if is not already
    ss.CONTENT = s;
    s = ss;
  end
  Attr = node.getAttributes;      % list of all attributes
  for iAttr = 1:Attr.getLength    % for each attribute
    name  = char(Attr.item(iAttr-1).getName);  % attribute name
    name  = str2varName(name, Pref.KeepNS);    % fix name if needed
    value = char(Attr.item(iAttr-1).getValue); % attribute value
    value = str2var(value, Pref.Str2Num, 1);   % convert to number if possible
    s.ATTRIBUTE.(name) = value;   % save again
  end
end % done with attributes
if (~isstruct(s)), return; end % The rest of the code deals with struct's

%% === Post-processing: fields of "s"
% convert 'cell-array of structs' to 'arrays of structs'
fields = fieldnames(s);     % get field names
nField = length(fields);
for iItem=1:length(s)       % for each struct in the array - usually one
  for iField=1:length(fields)
    field = fields{iField}; % get field name
    % if this is an 'item' field and user wants to leave those as cells
    % then skip this grouping step
    if (strcmpi(field, Pref.ItemName) && Pref.CellItem), continue; end
    x = s(iItem).(field);
    if (iscell(x) && all(cellfun(@isstruct,x(:))) && numel(x)>1) % it's cell-array of structs
      % numel(x)>1 check is to keep 1 cell-arrays created when Pref.CellItem=1
      try                           % this operation fails sometimes
        % example: change s(1).a{1}.b='jack'; s(1).a{2}.b='john'; to
        % more convenient s(1).a(1).b='jack'; s(1).a(2).b='john';
        s(iItem).(field) = [x{:}]'; %#ok<AGROW> % converted to arrays of structs
      catch %#ok<CTCH>
        % above operation will fail if s(1).a{1} and s(1).a{2} have
        % different fields. If desired, function forceCell2Struct can force
        % them to the same field structure by adding empty fields.
        if (Pref.NoCells)
          s(iItem).(field) = forceCell2Struct(x); %#ok<AGROW>
        end
      end
    end
  end
end

%% === Post-process struct's created for nodes with children =============

% --- Post-processing: remove special 'item' tags ---------------------
% many xml writers (including xml_write) use a special keyword to mark
% arrays of nodes (see xml_write for examples). The code below converts
% s.item to s.CONTENT
ItemContent = false;
if (isfield(s,Pref.ItemName))
  s.CONTENT = s.(Pref.ItemName);
  s = rmfield(s,Pref.ItemName);
  ItemContent = Pref.CellItem; % if CellItem then keep s.CONTENT as cells
end

% --- Post-processing: clean up CONTENT tags ---------------------
% if s.CONTENT is a cell-array with empty elements at the end then trim
% the length of this cell-array. Also if s.CONTENT is the only field then
% remove .CONTENT part and store it as s.
if (isfield(s,'CONTENT'))
  if (iscell(s.CONTENT) && isvector(s.CONTENT))
    x = s.CONTENT;
    for i=numel(x):-1:1, if ~isempty(x{i}), break; end; end
    if (i==1 && ~ItemContent)
      s.CONTENT = x{1};   % delete cell structure
    else
      s.CONTENT = x(1:i); % delete empty cells
    end
  end
  if (nField==1)
    if (ItemContent)
      ss = s.CONTENT;     % only child: remove a level but ensure output is a cell-array
      s = []; s{1} = ss;
    else
      s = s.CONTENT;      % only child: remove a level
    end
  end
end
%% =======================================================================
%  === forceCell2Struct Function =========================================
%  =======================================================================
function s = forceCell2Struct(x)
% Convert cell-array of structs, where not all of structs have the same
% fields, to a single array of structs.
%
% INPUT:
%  x  cell array of scalar structs
% OUTPUT:
%  s  numel(x)-by-1 struct array whose fields are the union of all field
%     names, in order of first appearance; fields an item did not have
%     are left empty ([])
%
% NOTE(review): the 'for'/'end' lines were missing from the damaged
% listing and were restored from the loop variable 'iItem' used below.

%% Convert 1D cell array of structs to 2D cell array, where each row
% represents item in original array and each column corresponds to a unique
% field name. Array "AllFields" stores the field name of each column
AllFields = fieldnames(x{1});     % get field names of the first struct
CellMat   = cell(length(x), length(AllFields));
for iItem = 1:length(x)
  fields = fieldnames(x{iItem});  % get field names of the next struct
  for iField = 1:length(fields)   % inspect all fieldnames and find those
    field = fields{iField};       % get field name
    col = find(strcmp(field,AllFields),1);
    if isempty(col)               % no column for such fieldname yet
      AllFields = [AllFields; {field}]; %#ok<AGROW>
      col = length(AllFields);    % create a new column for it
    end
    CellMat{iItem,col} = x{iItem}.(field); % store rearranged data
  end
end

%% Convert 2D cell array to array of structs
s = cell2struct(CellMat, AllFields, 2);
%% =======================================================================
%  === str2var Function ==================================================
%  =======================================================================
function val = str2var(str, option, attribute)
% Can this string 'str' be converted to a number? If so then do it.
%
% INPUT:
%  str        string to inspect
%  option     0 - "never": return 'str' unchanged
%             1 - "smart": convert unless the string is longer than 10000
%                 characters or datenum accepts it as a date
%             2 - "always": convert whenever the string looks numeric
%  attribute  true when 'str' is an attribute value: only a single number
%             is accepted (str2double) and boolean words are not converted
% OUTPUT:
%  val        numeric array, logical array, result of evaluating a
%             bracketed expression, or the original string
%
% NOTE(review): restored from a damaged listing; 'try', 'else' and 'end'
% lines were inferred - confirm against upstream.
val = str;
len = numel(str);
if (len==0    || option==0), return; end % Str2Num="never" or empty string -> do not do anything
if (len>10000 && option==1), return; end % Str2Num="smart" and string is very long -> probably base64 encoded binary
digits = '(Inf)|(NaN)|(pi)|[\t\n\d\+\-\*\.ei EI\[\]\;\,]';
s = regexprep(str, digits, ''); % remove all the digits and other allowed characters
if (isempty(s))                 % if nothing left then this is probably a number
  if (any(str==' ')), option=2; end % if str has white-spaces assume by default that it is not a date string
  if (any(str=='[')), option=2; end % same with brackets
  % parse data tables into 2D arrays, if any: line feeds become row
  % separators (the former strfind(str,'\n') looked for a literal
  % backslash followed by 'n' and never matched)
  str(str==char(10)) = ';';
  if (option==1)                % the 'smart' option
    try                         % try to convert to a date, like 2007-12-05
      datenum(str);             % if successful then leave it as string
    catch                       %#ok<CTCH> % if this is not a date then ...
      option=2;                 % ... try converting to a number
    end
  end
  if (option==2)
    if (attribute)
      num = str2double(str);      % try converting to a single number
      if isnan(num), return; end  % So, it wasn't really a number after all
    else
      num = str2num(str);         %#ok<ST2NM> % try converting to a single number or array using eval function
    end
    if (isnumeric(num) && numel(num)>0), val=num; end % if conversion was successful then save
  end
elseif ((str(1)=='[' && str(end)==']') || (str(1)=='{' && str(end)=='}')) % this looks like a (cell) array encoded as a string
  % SECURITY NOTE(review): the text comes straight from the XML document
  % and is executed with eval; do not feed untrusted files to this reader
  % (or use Str2Num='never') until this is replaced by a safe parser.
  try
    val = eval(str);
  catch              %#ok<CTCH>
    val = str;
  end
elseif (~attribute)   % see if it is a boolean array with no [] brackets
  str1 = lower(str);
  str1 = strrep(str1, 'false', '0');
  str1 = strrep(str1, 'true' , '1');
  s = regexprep(str1, '[01 \;\,]', ''); % remove all 0/1, spaces, commas and semicolons
  if (isempty(s))                 % if nothing left then this is probably a boolean array
    num = str2num(str1); %#ok<ST2NM>
    if (isnumeric(num) && numel(num)>0), val = (num>0); end % if conversion was successful then save as logical
  end
end
%% =======================================================================
%  === str2varName Function ==============================================
%  =======================================================================
function str = str2varName(str, KeepNS)
% Convert a string to a valid matlab variable name.
%
% INPUT:
%  str     xml tag or attribute name
%  KeepNS  true  - keep the namespace prefix, encoding the first ':' as
%                  '_COLON_'
%          false - drop everything up to and including the first ':'
% OUTPUT:
%  str     name with the first '-' encoded as '_DASH_'; if the result is
%          still not a valid variable name (and is not a MATLAB keyword)
%          it is passed through genvarname
%
% NOTE(review): the 'if'/'else'/'end' lines were missing from the damaged
% listing and were restored - confirm against upstream.
if (KeepNS)
  str = regexprep(str,':','_COLON_', 'once', 'ignorecase');
else
  k = strfind(str,':');
  if (~isempty(k))
    str = str(k(1)+1:end); % strip the prefix up to the first colon
  end
end
str = regexprep(str,'-','_DASH_'  ,'once', 'ignorecase');
if (~isvarname(str)) && (~iskeyword(str))
  str = genvarname(str);
end
509%% =======================================================================
510% === NodeName Function =================================================
511% =======================================================================
512function [Name LeafNode] = NodeName(node, KeepNS)
513% get node name and make sure it
is a valid variable name in Matlab.
515% LeafNode=0 - normal element node,
516% LeafNode=1 - text node
517% LeafNode=2 - supported non-text leaf node,
518% LeafNode=3 - supported processing instructions leaf node,
519% LeafNode=-1 - unsupported non-text leaf node
520switch (node.getNodeType)
521 case node.ELEMENT_NODE
522 Name = char(node.getNodeName);% capture name of the node
523 Name = str2varName(Name, KeepNS); %
if Name
is not a good variable name - fix it
528 case node.COMMENT_NODE
531 case node.CDATA_SECTION_NODE
532 Name =
'CDATA_SECTION';
534 case node.DOCUMENT_TYPE_NODE
535 Name =
'DOCUMENT_TYPE';
537 case node.PROCESSING_INSTRUCTION_NODE
538 Name =
'PROCESSING_INSTRUCTION';
541 NodeType = {
'ELEMENT',
'ATTRIBUTE',
'TEXT',
'CDATA_SECTION', ...
542 'ENTITY_REFERENCE',
'ENTITY',
'PROCESSING_INSTRUCTION',
'COMMENT',...
543 'DOCUMENT',
'DOCUMENT_TYPE',
'DOCUMENT_FRAGMENT',
'NOTATION'};
544 Name = char(node.getNodeName);% capture name of the node
545 warning(
'xml_io_tools:read:unkNode', ...
546 'Unknown node type encountered: %s_NODE (%s)', NodeType{node.getNodeType}, Name);