aboutsummaryrefslogtreecommitdiff
blob: bd8c6c5442309a26ba523e0506e2b3a25fd539c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
/*
 * Copyright (C) 2009-2010 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 *
 * Authors:
 *     Mark McLoughlin <markmc@redhat.com>
 */

#include <config.h>

#include "pci.h"

#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "logging.h"
#include "memory.h"
#include "util.h"
#include "virterror_internal.h"
#include "files.h"

/* avoid compilation breakage on some systems */
#ifndef MODPROBE
# define MODPROBE "modprobe"
#endif

#define PCI_SYSFS "/sys/bus/pci/"
#define PCI_ID_LEN 10   /* "XXXX XXXX" */
#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" */

struct _pciDevice {
    unsigned      domain;
    unsigned      bus;
    unsigned      slot;
    unsigned      function;

    char          name[PCI_ADDR_LEN]; /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
    char          path[PATH_MAX];
    int           fd;

    unsigned      initted;
    unsigned      pcie_cap_pos;
    unsigned      pci_pm_cap_pos;
    unsigned      has_flr : 1;
    unsigned      has_pm_reset : 1;
    unsigned      managed : 1;
};

struct _pciDeviceList {
    unsigned count;
    pciDevice **devs;
};


/* For virReportOOMError()  and virReportSystemError() */
#define VIR_FROM_THIS VIR_FROM_NONE

#define pciReportError(code, ...)                              \
    virReportErrorHelper(NULL, VIR_FROM_NONE, code, __FILE__,  \
                         __FUNCTION__, __LINE__, __VA_ARGS__)

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
#define PCI_EXP_DEVCAP_FLR     (1<<28) /* Function Level Reset */

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */

#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV | \
                                 PCI_EXT_CAP_ACS_RR | \
                                 PCI_EXT_CAP_ACS_CR | \
                                 PCI_EXT_CAP_ACS_UF)

static int
pciOpenConfig(pciDevice *dev)
{
    int fd;

    if (dev->fd > 0)
        return 0;

    fd = open(dev->path, O_RDWR);
    if (fd < 0) {
        char ebuf[1024];
        VIR_WARN("Failed to open config space file '%s': %s",
                 dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
    dev->fd = fd;
    return 0;
}

static void
pciCloseConfig(pciDevice *dev)
{
    if (!dev)
        return;

    VIR_FORCE_CLOSE(dev->fd);
}

static int
pciRead(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen)
{
    memset(buf, 0, buflen);

    if (pciOpenConfig(dev) < 0)
        return -1;

    if (lseek(dev->fd, pos, SEEK_SET) != pos ||
        saferead(dev->fd, buf, buflen) != buflen) {
        char ebuf[1024];
        VIR_WARN("Failed to read from '%s' : %s", dev->path,
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static uint8_t
pciRead8(pciDevice *dev, unsigned pos)
{
    uint8_t buf;
    pciRead(dev, pos, &buf, sizeof(buf));
    return buf;
}

static uint16_t
pciRead16(pciDevice *dev, unsigned pos)
{
    uint8_t buf[2];
    pciRead(dev, pos, &buf[0], sizeof(buf));
    return (buf[0] << 0) | (buf[1] << 8);
}

static uint32_t
pciRead32(pciDevice *dev, unsigned pos)
{
    uint8_t buf[4];
    pciRead(dev, pos, &buf[0], sizeof(buf));
    return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
}

static int
pciWrite(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen)
{
    if (pciOpenConfig(dev) < 0)
        return -1;

    if (lseek(dev->fd, pos, SEEK_SET) != pos ||
        safewrite(dev->fd, buf, buflen) != buflen) {
        char ebuf[1024];
        VIR_WARN("Failed to write to '%s' : %s", dev->path,
                 virStrerror(errno, ebuf, sizeof(ebuf)));
        return -1;
    }
    return 0;
}

static void
pciWrite16(pciDevice *dev, unsigned pos, uint16_t val)
{
    uint8_t buf[2] = { (val >> 0), (val >> 8) };
    pciWrite(dev, pos, &buf[0], sizeof(buf));
}

static void
pciWrite32(pciDevice *dev, unsigned pos, uint32_t val)
{
    uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 14) };
    pciWrite(dev, pos, &buf[0], sizeof(buf));
}

typedef int (*pciIterPredicate)(pciDevice *, pciDevice *, void *);

/* Iterate over available PCI devices calling @predicate
 * to compare each one to @dev.
 * Return -1 on error since we don't want to assume it is
 * safe to reset if there is an error.
 */
static int
pciIterDevices(pciIterPredicate predicate,
               pciDevice *dev,
               pciDevice **matched,
               void *data)
{
    DIR *dir;
    struct dirent *entry;
    int ret = 0;
    int rc;

    *matched = NULL;

    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);

    dir = opendir(PCI_SYSFS "devices");
    if (!dir) {
        VIR_WARN0("Failed to open " PCI_SYSFS "devices");
        return -1;
    }

    while ((entry = readdir(dir))) {
        unsigned int domain, bus, slot, function;
        pciDevice *check;
        char *tmp;

        /* Ignore '.' and '..' */
        if (entry->d_name[0] == '.')
            continue;

        /* expected format: <domain>:<bus>:<slot>.<function> */
        if (/* domain */
            virStrToLong_ui(entry->d_name, &tmp, 16, &domain) < 0 || *tmp != ':' ||
            /* bus */
            virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
            /* slot */
            virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
            /* function */
            virStrToLong_ui(tmp + 1, NULL, 16, &function) < 0) {
            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
            continue;
        }

        check = pciGetDevice(domain, bus, slot, function);
        if (!check) {
            ret = -1;
            break;
        }

        rc = predicate(dev, check, data);
        if (rc < 0) {
            /* the predicate returned an error, bail */
            pciFreeDevice(check);
            ret = -1;
            break;
        }
        else if (rc == 1) {
            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
            *matched = check;
            ret = 1;
            break;
        }

        pciFreeDevice(check);
    }
    closedir(dir);
    return ret;
}

static uint8_t
pciFindCapabilityOffset(pciDevice *dev, unsigned capability)
{
    uint16_t status;
    uint8_t pos;

    status = pciRead16(dev, PCI_STATUS);
    if (!(status & PCI_STATUS_CAP_LIST))
        return 0;

    pos = pciRead8(dev, PCI_CAPABILITY_LIST);

    /* Zero indicates last capability, capabilities can't
     * be in the config space header and 0xff is returned
     * by the kernel if we don't have access to this region
     *
     * Note: we're not handling loops or extended
     * capabilities here.
     */
    while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
        uint8_t capid = pciRead8(dev, pos);
        if (capid == capability) {
            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
                      dev->id, dev->name, capability, pos);
            return pos;
        }

        pos = pciRead8(dev, pos + 1);
    }

    VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability);

    return 0;
}

static unsigned int
pciFindExtendedCapabilityOffset(pciDevice *dev, unsigned capability)
{
    int ttl;
    unsigned int pos;
    uint32_t header;

    /* minimum 8 bytes per capability */
    ttl = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
    pos = PCI_EXT_CAP_BASE;

    while (ttl > 0 && pos >= PCI_EXT_CAP_BASE) {
        header = pciRead32(dev, pos);

        if ((header & PCI_EXT_CAP_ID_MASK) == capability)
            return pos;

        pos = (header >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
        ttl--;
    }

    return 0;
}

/* detects whether this device has FLR.  Returns 0 if the device does
 * not have FLR, 1 if it does, and -1 on error
 */
static int
pciDetectFunctionLevelReset(pciDevice *dev)
{
    uint32_t caps;
    uint8_t pos;
    char *path;
    int found;

    /* The PCIe Function Level Reset capability allows
     * individual device functions to be reset without
     * affecting any other functions on the device or
     * any other devices on the bus. This is only common
     * on SR-IOV NICs at the moment.
     */
    if (dev->pcie_cap_pos) {
        caps = pciRead32(dev, dev->pcie_cap_pos + PCI_EXP_DEVCAP);
        if (caps & PCI_EXP_DEVCAP_FLR) {
            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
            return 1;
        }
    }

    /* The PCI AF Function Level Reset capability is
     * the same thing, except for conventional PCI
     * devices. This is not common yet.
     */
    pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_AF);
    if (pos) {
        caps = pciRead16(dev, pos + PCI_AF_CAP);
        if (caps & PCI_AF_CAP_FLR) {
            VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
            return 1;
        }
    }

    /* there are some buggy devices that do support FLR, but forget to
     * advertise that fact in their capabilities.  However, FLR is *required*
     * to be present for virtual functions (VFs), so if we see that this
     * device is a VF, we just assume FLR works
     */

    if (virAsprintf(&path, PCI_SYSFS "devices/%s/physfn", dev->name) < 0) {
        virReportOOMError();
        return -1;
    }

    found = virFileExists(path);
    VIR_FREE(path);
    if (found) {
        VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
                  dev->id, dev->name);
        return 1;
    }

    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);

    return 0;
}

/* Require the device has the PCI Power Management capability
 * and that a D3hot->D0 transition will results in a full
 * internal reset, not just a soft reset.
 */
static unsigned
pciDetectPowerManagementReset(pciDevice *dev)
{
    if (dev->pci_pm_cap_pos) {
        uint32_t ctl;

        /* require the NO_SOFT_RESET bit is clear */
        ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL);
        if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
            VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
            return 1;
        }
    }

    VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);

    return 0;
}

/* Any active devices on the same domain/bus ? */
static int
pciSharesBusWithActive(pciDevice *dev, pciDevice *check, void *data)
{
    pciDeviceList *inactiveDevs = data;

    /* Different domain, different bus, or simply identical device */
    if (dev->domain != check->domain ||
        dev->bus != check->bus ||
        (dev->slot == check->slot &&
         dev->function == check->function))
        return 0;

    /* same bus, but inactive, i.e. about to be assigned to guest */
    if (inactiveDevs && pciDeviceListFind(inactiveDevs, check))
        return 0;

    return 1;
}

static pciDevice *
pciBusContainsActiveDevices(pciDevice *dev,
                            pciDeviceList *inactiveDevs)
{
    pciDevice *active = NULL;
    if (pciIterDevices(pciSharesBusWithActive,
                       dev, &active, inactiveDevs) < 0)
        return NULL;
    return active;
}

/* Is @check the parent of @dev ? */
static int
pciIsParent(pciDevice *dev, pciDevice *check, void *data)
{
    uint16_t device_class;
    uint8_t header_type, secondary, subordinate;
    pciDevice **best = data;

    if (dev->domain != check->domain)
        return 0;

    /* Is it a bridge? */
    device_class = pciRead16(check, PCI_CLASS_DEVICE);
    if (device_class != PCI_CLASS_BRIDGE_PCI)
        return 0;

    /* Is it a plane? */
    header_type = pciRead8(check, PCI_HEADER_TYPE);
    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
        return 0;

    secondary   = pciRead8(check, PCI_SECONDARY_BUS);
    subordinate = pciRead8(check, PCI_SUBORDINATE_BUS);

    VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);

    /* if the secondary bus exactly equals the device's bus, then we found
     * the direct parent.  No further work is necessary
     */
    if (dev->bus == secondary)
        return 1;

    /* otherwise, SRIOV allows VFs to be on different busses then their PFs.
     * In this case, what we need to do is look for the "best" match; i.e.
     * the most restrictive match that still satisfies all of the conditions.
     */
    if (dev->bus > secondary && dev->bus <= subordinate) {
        if (*best == NULL) {
            *best = pciGetDevice(check->domain, check->bus, check->slot,
                                 check->function);
            if (*best == NULL)
                return -1;
        }
        else {
            /* OK, we had already recorded a previous "best" match for the
             * parent.  See if the current device is more restrictive than the
             * best, and if so, make it the new best
             */
            if (secondary > pciRead8(*best, PCI_SECONDARY_BUS)) {
                pciFreeDevice(*best);
                *best = pciGetDevice(check->domain, check->bus, check->slot,
                                     check->function);
                if (*best == NULL)
                    return -1;
            }
        }
    }

    return 0;
}

static int
pciGetParentDevice(pciDevice *dev, pciDevice **parent)
{
    pciDevice *best = NULL;
    int ret;

    *parent = NULL;
    ret = pciIterDevices(pciIsParent, dev, parent, &best);
    if (ret == 1)
        pciFreeDevice(best);
    else if (ret == 0)
        *parent = best;
    return ret;
}

/* Secondary Bus Reset is our sledgehammer - it resets all
 * devices behind a bus.
 */
static int
pciTrySecondaryBusReset(pciDevice *dev,
                        pciDeviceList *inactiveDevs)
{
    pciDevice *parent, *conflict;
    uint8_t config_space[PCI_CONF_LEN];
    uint16_t ctl;
    int ret = -1;

    /* For now, we just refuse to do a secondary bus reset
     * if there are other devices/functions behind the bus.
     * In future, we could allow it so long as those devices
     * are not in use by the host or other guests.
     */
    if ((conflict = pciBusContainsActiveDevices(dev, inactiveDevs))) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Active %s devices on bus with %s, not doing bus reset"),
                       conflict->name, dev->name);
        return -1;
    }

    /* Find the parent bus */
    if (pciGetParentDevice(dev, &parent) < 0)
        return -1;
    if (!parent) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to find parent device for %s"),
                       dev->name);
        return -1;
    }

    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);

    /* Save and restore the device's config space; we only do this
     * for the supplied device since we refuse to do a reset if there
     * are multiple devices/functions
     */
    if (pciRead(dev, 0, config_space, PCI_CONF_LEN) < 0) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to read PCI config space for %s"),
                       dev->name);
        goto out;
    }

    /* Read the control register, set the reset flag, wait 200ms,
     * unset the reset flag and wait 200ms.
     */
    ctl = pciRead16(dev, PCI_BRIDGE_CONTROL);

    pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl | PCI_BRIDGE_CTL_RESET);

    usleep(200 * 1000); /* sleep 200ms */

    pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl);

    usleep(200 * 1000); /* sleep 200ms */

    if (pciWrite(dev, 0, config_space, PCI_CONF_LEN) < 0) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        goto out;
    }
    ret = 0;
out:
    pciFreeDevice(parent);
    return ret;
}

/* Power management reset attempts to reset a device using a
 * D-state transition from D3hot to D0. Note, in detect_pm_reset()
 * above we require the device supports a full internal reset.
 */
static int
pciTryPowerManagementReset(pciDevice *dev)
{
    uint8_t config_space[PCI_CONF_LEN];
    uint32_t ctl;

    if (!dev->pci_pm_cap_pos)
        return -1;

    /* Save and restore the device's config space. */
    if (pciRead(dev, 0, &config_space[0], PCI_CONF_LEN) < 0) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to read PCI config space for %s"),
                       dev->name);
        return -1;
    }

    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);

    ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL);
    ctl &= ~PCI_PM_CTRL_STATE_MASK;

    pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D3hot);

    usleep(10 * 1000); /* sleep 10ms */

    pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D0);

    usleep(10 * 1000); /* sleep 10ms */

    if (pciWrite(dev, 0, &config_space[0], PCI_CONF_LEN) < 0) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        return -1;
    }

    return 0;
}

static int
pciInitDevice(pciDevice *dev)
{
    int flr;

    if (pciOpenConfig(dev) < 0) {
        virReportSystemError(errno,
                             _("Failed to open config space file '%s'"),
                             dev->path);
        return -1;
    }

    dev->pcie_cap_pos   = pciFindCapabilityOffset(dev, PCI_CAP_ID_EXP);
    dev->pci_pm_cap_pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_PM);
    flr = pciDetectFunctionLevelReset(dev);
    if (flr < 0)
        return flr;
    dev->has_flr        = flr;
    dev->has_pm_reset   = pciDetectPowerManagementReset(dev);
    dev->initted        = 1;
    return 0;
}

int
pciResetDevice(pciDevice *dev,
               pciDeviceList *activeDevs,
               pciDeviceList *inactiveDevs)
{
    int ret = -1;

    if (activeDevs && pciDeviceListFind(activeDevs, dev)) {
        pciReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not resetting active device %s"), dev->name);
        return -1;
    }

    if (!dev->initted && pciInitDevice(dev) < 0)
        return -1;

    /* KVM will perform FLR when starting and stopping
     * a guest, so there is no need for us to do it here.
     */
    if (dev->has_flr)
        return 0;

    /* If the device supports PCI power management reset,
     * that's the next best thing because it only resets
     * the function, not the whole device.
     */
    if (dev->has_pm_reset)
        ret = pciTryPowerManagementReset(dev);

    /* Bus reset is not an option with the root bus */
    if (ret < 0 && dev->bus != 0)
        ret = pciTrySecondaryBusReset(dev, inactiveDevs);

    if (ret < 0) {
        virErrorPtr err = virGetLastError();
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Unable to reset PCI device %s: %s"),
                       dev->name,
                       err ? err->message : _("no FLR, PM reset or bus reset available"));
    }

    return ret;
}


static void
pciDriverDir(char *buf, size_t buflen, const char *driver)
{
    snprintf(buf, buflen, PCI_SYSFS "drivers/%s", driver);
}

static void
pciDriverFile(char *buf, size_t buflen, const char *driver, const char *file)
{
    snprintf(buf, buflen, PCI_SYSFS "drivers/%s/%s", driver, file);
}

static void
pciDeviceFile(char *buf, size_t buflen, const char *device, const char *file)
{
    snprintf(buf, buflen, PCI_SYSFS "devices/%s/%s", device, file);
}


static const char *
pciFindStubDriver(void)
{
    char drvpath[PATH_MAX];
    int probed = 0;

recheck:
    pciDriverDir(drvpath, sizeof(drvpath), "pci-stub");
    if (virFileExists(drvpath))
        return "pci-stub";
    pciDriverDir(drvpath, sizeof(drvpath), "pciback");
    if (virFileExists(drvpath))
        return "pciback";

    if (!probed) {
        const char *const stubprobe[] = { MODPROBE, "pci-stub", NULL };
        const char *const backprobe[] = { MODPROBE, "pciback", NULL };

        probed = 1;
        /*
         * Probing for pci-stub will succeed regardless of whether
         * on native or Xen kernels.
         * On Xen though, we want to prefer pciback, so probe
         * for that first, because that will only work on Xen
         */
        if (virRun(backprobe, NULL) < 0 &&
            virRun(stubprobe, NULL) < 0) {
            char ebuf[1024];
            VIR_WARN("failed to load pci-stub or pciback drivers: %s",
                     virStrerror(errno, ebuf, sizeof ebuf));
            return NULL;
        }

        goto recheck;
    }

    return NULL;
}


static int
pciBindDeviceToStub(pciDevice *dev, const char *driver)
{
    char drvdir[PATH_MAX];
    char path[PATH_MAX];

    /* Add the PCI device ID to the stub's dynamic ID table;
     * this is needed to allow us to bind the device to the stub.
     * Note: if the device is not currently bound to any driver,
     * stub will immediately be bound to the device. Also, note
     * that if a new device with this ID is hotplugged, or if a probe
     * is triggered for such a device, it will also be immediately
     * bound by the stub.
     */
    pciDriverFile(path, sizeof(path), driver, "new_id");
    if (virFileWriteStr(path, dev->id) < 0) {
        virReportSystemError(errno,
                             _("Failed to add PCI device ID '%s' to %s"),
                             dev->id, driver);
        return -1;
    }

    /* If the device is already bound to a driver, unbind it.
     * Note, this will have rather unpleasant side effects if this
     * PCI device happens to be IDE controller for the disk hosting
     * your root filesystem.
     */
    pciDeviceFile(path, sizeof(path), dev->name, "driver/unbind");
    if (virFileExists(path) && virFileWriteStr(path, dev->name) < 0) {
        virReportSystemError(errno,
                             _("Failed to unbind PCI device '%s'"), dev->name);
        return -1;
    }

    /* If the device isn't already bound to pci-stub, try binding it now.
     */
    pciDriverDir(drvdir, sizeof(drvdir), driver);
    pciDeviceFile(path, sizeof(path), dev->name, "driver");
    if (!virFileLinkPointsTo(path, drvdir)) {
        /* Xen's pciback.ko wants you to use new_slot first */
        pciDriverFile(path, sizeof(path), driver, "new_slot");
        if (virFileExists(path) && virFileWriteStr(path, dev->name) < 0) {
            virReportSystemError(errno,
                                 _("Failed to add slot for PCI device '%s' to %s"),
                                 dev->name, driver);
            return -1;
        }

        pciDriverFile(path, sizeof(path), driver, "bind");
        if (virFileWriteStr(path, dev->name) < 0) {
            virReportSystemError(errno,
                                 _("Failed to bind PCI device '%s' to %s"),
                                 dev->name, driver);
            return -1;
        }
    }

    /* If 'remove_id' exists, remove the device id from pci-stub's dynamic
     * ID table so that 'drivers_probe' works below.
     */
    pciDriverFile(path, sizeof(path), driver, "remove_id");
    if (virFileExists(path) && virFileWriteStr(path, dev->id) < 0) {
        virReportSystemError(errno,
                             _("Failed to remove PCI ID '%s' from %s"),
                             dev->id, driver);
        return -1;
    }

    return 0;
}

int
pciDettachDevice(pciDevice *dev, pciDeviceList *activeDevs)
{
    const char *driver = pciFindStubDriver();
    if (!driver) {
        pciReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("cannot find any PCI stub module"));
        return -1;
    }

    if (activeDevs && pciDeviceListFind(activeDevs, dev)) {
        pciReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not detaching active device %s"), dev->name);
        return -1;
    }

    return pciBindDeviceToStub(dev, driver);
}

static int
pciUnBindDeviceFromStub(pciDevice *dev, const char *driver)
{
    char drvdir[PATH_MAX];
    char path[PATH_MAX];

    /* If the device is bound to stub, unbind it.
     */
    pciDriverDir(drvdir, sizeof(drvdir), driver);
    pciDeviceFile(path, sizeof(path), dev->name, "driver");
    if (virFileExists(drvdir) && virFileLinkPointsTo(path, drvdir)) {
        pciDriverFile(path, sizeof(path), driver, "unbind");
        if (virFileWriteStr(path, dev->name) < 0) {
            virReportSystemError(errno,
                                 _("Failed to bind PCI device '%s' to %s"),
                                 dev->name, driver);
            return -1;
        }
    }

    /* Xen's pciback.ko wants you to use remove_slot on the specific device */
    pciDriverFile(path, sizeof(path), driver, "remove_slot");
    if (virFileExists(path) && virFileWriteStr(path, dev->name) < 0) {
        virReportSystemError(errno,
                             _("Failed to remove slot for PCI device '%s' to %s"),
                             dev->name, driver);
        return -1;
    }


    /* Trigger a re-probe of the device is not in the stub's dynamic
     * ID table. If the stub is available, but 'remove_id' isn't
     * available, then re-probing would just cause the device to be
     * re-bound to the stub.
     */
    pciDriverFile(path, sizeof(path), driver, "remove_id");
    if (!virFileExists(drvdir) || virFileExists(path)) {
        if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name) < 0) {
            virReportSystemError(errno,
                                 _("Failed to trigger a re-probe for PCI device '%s'"),
                                 dev->name);
            return -1;
        }
    }

    return 0;
}

int
pciReAttachDevice(pciDevice *dev, pciDeviceList *activeDevs)
{
    const char *driver = pciFindStubDriver();
    if (!driver) {
        pciReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("cannot find any PCI stub module"));
        return -1;
    }

    if (activeDevs && pciDeviceListFind(activeDevs, dev)) {
        pciReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not reattaching active device %s"), dev->name);
        return -1;
    }

    return pciUnBindDeviceFromStub(dev, driver);
}

/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
 * the host when doing device passthrough.  This can lead to a race
 * condition where the hypervisor is still cleaning up the device while
 * libvirt is trying to re-attach it to the host device driver.  To avoid
 * this situation, we look through /proc/iomem, and if the hypervisor is
 * still holding onto the bar (denoted by the string in the matcher variable),
 * then we can wait around a bit for that to clear up.
 *
 * A typical /proc/iomem looks like this (snipped for brevity):
 * 00010000-0008efff : System RAM
 * 0008f000-0008ffff : reserved
 * ...
 * 00100000-cc9fcfff : System RAM
 *   00200000-00483d3b : Kernel code
 *   00483d3c-005c88df : Kernel data
 * cc9fd000-ccc71fff : ACPI Non-volatile Storage
 * ...
 * d0200000-d02fffff : PCI Bus #05
 *   d0200000-d021ffff : 0000:05:00.0
 *     d0200000-d021ffff : e1000e
 *   d0220000-d023ffff : 0000:05:00.0
 *     d0220000-d023ffff : e1000e
 * ...
 * f0000000-f0003fff : 0000:00:1b.0
 *   f0000000-f0003fff : kvm_assigned_device
 *
 * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
 * holding onto the resource.
 */
int
pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
{
    FILE *fp;
    char line[160];
    char *tmp;
    unsigned long long start, end;
    unsigned int domain, bus, slot, function;
    int in_matching_device;
    int ret;
    size_t match_depth;

    fp = fopen("/proc/iomem", "r");
    if (!fp) {
        /* If we failed to open iomem, we just basically ignore the error.  The
         * unbind might succeed anyway, and besides, it's very likely we have
         * no way to report the error
         */
        VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
        return 0;
    }

    ret = 0;
    in_matching_device = 0;
    match_depth = 0;
    while (fgets(line, sizeof(line), fp) != 0) {
        /* the logic here is a bit confusing.  For each line, we look to
         * see if it matches the domain:bus:slot.function we were given.
         * If this line matches the DBSF, then any subsequent lines indented
         * by 2 spaces are the PCI regions for this device.  It's also
         * possible that none of the PCI regions are currently mapped, in
         * which case we have no indented regions.  This code handles all
         * of these situations
         */
        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
            /* expected format: <start>-<end> : <suffix> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL)
                continue;

            if (STRPREFIX(tmp, matcher)) {
                ret = 1;
                break;
            }
        }
        else {
            in_matching_device = 0;

            /* expected format: <start>-<end> : <domain>:<bus>:<slot>.<function> */
            if (/* start */
                virStrToLong_ull(line, &tmp, 16, &start) < 0 || *tmp != '-' ||
                /* end */
                virStrToLong_ull(tmp + 1, &tmp, 16, &end) < 0 ||
                (tmp = STRSKIP(tmp, " : ")) == NULL ||
                /* domain */
                virStrToLong_ui(tmp, &tmp, 16, &domain) < 0 || *tmp != ':' ||
                /* bus */
                virStrToLong_ui(tmp + 1, &tmp, 16, &bus) < 0 || *tmp != ':' ||
                /* slot */
                virStrToLong_ui(tmp + 1, &tmp, 16, &slot) < 0 || *tmp != '.' ||
                /* function */
                virStrToLong_ui(tmp + 1, &tmp, 16, &function) < 0 || *tmp != '\n')
                continue;

            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
                function != dev->function)
                continue;
            in_matching_device = 1;
            match_depth = strspn(line, " ");
        }
    }

    fclose(fp);

    return ret;
}

static char *
pciReadDeviceID(pciDevice *dev, const char *id_name)
{
    char path[PATH_MAX];
    char *id_str;

    snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/%s",
             dev->name, id_name);

    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
    if (virFileReadAll(path, 7, &id_str) < 0)
        return NULL;

    /* Check for 0x suffix */
    if (id_str[0] != '0' || id_str[1] != 'x') {
        VIR_FREE(id_str);
        return NULL;
    }

    /* Chop off the newline; we know the string is 7 bytes */
    id_str[6] = '\0';

    return id_str;
}

pciDevice *
pciGetDevice(unsigned domain,
             unsigned bus,
             unsigned slot,
             unsigned function)
{
    pciDevice *dev;
    char *vendor, *product;

    if (VIR_ALLOC(dev) < 0) {
        virReportOOMError();
        return NULL;
    }

    dev->fd       = -1;
    dev->domain   = domain;
    dev->bus      = bus;
    dev->slot     = slot;
    dev->function = function;

    snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x",
             dev->domain, dev->bus, dev->slot, dev->function);
    snprintf(dev->path, sizeof(dev->path),
             PCI_SYSFS "devices/%s/config", dev->name);

    if (access(dev->path, F_OK) != 0) {
        virReportSystemError(errno,
                             _("Device %s not found: could not access %s"),
                             dev->name, dev->path);
        pciFreeDevice(dev);
        return NULL;
    }

    vendor  = pciReadDeviceID(dev, "vendor");
    product = pciReadDeviceID(dev, "device");

    if (!vendor || !product) {
        pciReportError(VIR_ERR_NO_SUPPORT,
                       _("Failed to read product/vendor ID for %s"),
                       dev->name);
        VIR_FREE(product);
        VIR_FREE(vendor);
        pciFreeDevice(dev);
        return NULL;
    }

    /* strings contain '0x' prefix */
    snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2], &product[2]);

    VIR_FREE(product);
    VIR_FREE(vendor);

    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);

    return dev;
}

void
pciFreeDevice(pciDevice *dev)
{
    if (!dev)
        return;
    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
    pciCloseConfig(dev);
    VIR_FREE(dev);
}

void pciDeviceSetManaged(pciDevice *dev, unsigned managed)
{
    dev->managed = !!managed;
}

unsigned pciDeviceGetManaged(pciDevice *dev)
{
    return dev->managed;
}

pciDeviceList *
pciDeviceListNew(void)
{
    pciDeviceList *list;

    if (VIR_ALLOC(list) < 0) {
        virReportOOMError();
        return NULL;
    }

    return list;
}

void
pciDeviceListFree(pciDeviceList *list)
{
    int i;

    if (!list)
        return;

    for (i = 0; i < list->count; i++) {
        pciFreeDevice(list->devs[i]);
        list->devs[i] = NULL;
    }

    list->count = 0;
    VIR_FREE(list->devs);
    VIR_FREE(list);
}

int
pciDeviceListAdd(pciDeviceList *list,
                 pciDevice *dev)
{
    if (pciDeviceListFind(list, dev)) {
        pciReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is already in use"), dev->name);
        return -1;
    }

    if (VIR_REALLOC_N(list->devs, list->count+1) < 0) {
        virReportOOMError();
        return -1;
    }

    list->devs[list->count++] = dev;

    return 0;
}

pciDevice *
pciDeviceListGet(pciDeviceList *list,
                 int idx)
{
    if (idx >= list->count)
        return NULL;
    if (idx < 0)
        return NULL;

    return list->devs[idx];
}

int
pciDeviceListCount(pciDeviceList *list)
{
    return list->count;
}

pciDevice *
pciDeviceListSteal(pciDeviceList *list,
                   pciDevice *dev)
{
    pciDevice *ret = NULL;
    int i;

    for (i = 0; i < list->count; i++) {
        if (list->devs[i]->domain   != dev->domain ||
            list->devs[i]->bus      != dev->bus    ||
            list->devs[i]->slot     != dev->slot   ||
            list->devs[i]->function != dev->function)
            continue;

        ret = list->devs[i];

        if (i != --list->count)
            memmove(&list->devs[i],
                    &list->devs[i+1],
                    sizeof(*list->devs) * (list->count-i));

        if (VIR_REALLOC_N(list->devs, list->count) < 0) {
            ; /* not fatal */
        }

        break;
    }
    return ret;
}

void
pciDeviceListDel(pciDeviceList *list,
                 pciDevice *dev)
{
    pciDevice *ret = pciDeviceListSteal(list, dev);
    if (ret)
        pciFreeDevice(ret);
}

pciDevice *
pciDeviceListFind(pciDeviceList *list, pciDevice *dev)
{
    int i;

    for (i = 0; i < list->count; i++)
        if (list->devs[i]->domain   == dev->domain &&
            list->devs[i]->bus      == dev->bus    &&
            list->devs[i]->slot     == dev->slot   &&
            list->devs[i]->function == dev->function)
            return list->devs[i];
    return NULL;
}


int pciDeviceFileIterate(pciDevice *dev,
                         pciDeviceFileActor actor,
                         void *opaque)
{
    char *pcidir = NULL;
    char *file = NULL;
    DIR *dir = NULL;
    int ret = -1;
    struct dirent *ent;

    if (virAsprintf(&pcidir, "/sys/bus/pci/devices/%04x:%02x:%02x.%x",
                    dev->domain, dev->bus, dev->slot, dev->function) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (!(dir = opendir(pcidir))) {
        virReportSystemError(errno,
                             _("cannot open %s"), pcidir);
        goto cleanup;
    }

    while ((ent = readdir(dir)) != NULL) {
        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN, $PCIDIR/rom
         */
        if (STREQ(ent->d_name, "config") ||
            STRPREFIX(ent->d_name, "resource") ||
            STREQ(ent->d_name, "rom")) {
            if (virAsprintf(&file, "%s/%s", pcidir, ent->d_name) < 0) {
                virReportOOMError();
                goto cleanup;
            }
            if ((actor)(dev, file, opaque) < 0)
                goto cleanup;

            VIR_FREE(file);
        }
    }

    ret = 0;

cleanup:
    if (dir)
        closedir(dir);
    VIR_FREE(file);
    VIR_FREE(pcidir);
    return ret;
}

static int
pciDeviceDownstreamLacksACS(pciDevice *dev)
{
    uint16_t flags;
    uint16_t ctrl;
    unsigned int pos;

    if (!dev->initted && pciInitDevice(dev) < 0)
        return -1;

    pos = dev->pcie_cap_pos;
    if (!pos || pciRead16(dev, PCI_CLASS_DEVICE) != PCI_CLASS_BRIDGE_PCI)
        return 0;

    flags = pciRead16(dev, pos + PCI_EXP_FLAGS);
    if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
        return 0;

    pos = pciFindExtendedCapabilityOffset(dev, PCI_EXT_CAP_ID_ACS);
    if (!pos) {
        VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
        return 1;
    }

    ctrl = pciRead16(dev, pos + PCI_EXT_ACS_CTRL);
    if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
        VIR_DEBUG("%s %s: downstream port has ACS disabled",
                  dev->id, dev->name);
        return 1;
    }

    return 0;
}

static int
pciDeviceIsBehindSwitchLackingACS(pciDevice *dev)
{
    pciDevice *parent;

    if (pciGetParentDevice(dev, &parent) < 0)
        return -1;
    if (!parent) {
        /* if we have no parent, and this is the root bus, ACS doesn't come
         * into play since devices on the root bus can't P2P without going
         * through the root IOMMU.
         */
        if (dev->bus == 0)
            return 0;
        else {
            pciReportError(VIR_ERR_NO_SUPPORT,
                           _("Failed to find parent device for %s"),
                           dev->name);
            return -1;
        }
    }

    /* XXX we should rather fail when we can't find device's parent and
     * stop the loop when we get to root instead of just stopping when no
     * parent can be found
     */
    do {
        pciDevice *tmp;
        int acs;
        int ret;

        acs = pciDeviceDownstreamLacksACS(parent);

        if (acs) {
            pciFreeDevice(parent);
            if (acs < 0)
                return -1;
            else
                return 1;
        }

        tmp = parent;
        ret = pciGetParentDevice(parent, &parent);
        pciFreeDevice(tmp);
        if (ret < 0)
            return -1;
    } while (parent);

    return 0;
}

int pciDeviceIsAssignable(pciDevice *dev,
                          int strict_acs_check)
{
    int ret;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    ret = pciDeviceIsBehindSwitchLackingACS(dev);
    if (ret < 0)
        return 0;

    if (ret) {
        if (!strict_acs_check) {
            VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
                      dev->id, dev->name);
        } else {
            pciReportError(VIR_ERR_NO_SUPPORT,
                           _("Device %s is behind a switch lacking ACS and "
                             "cannot be assigned"),
                           dev->name);
            return 0;
        }
    }

    return 1;
}